Added setting saving for exllama and exllamav2

Author: ebolam
Date: 2023-10-12 21:04:00 -04:00
parent 0688ba4fcd
commit d5dd24a330
2 changed files with 55 additions and 8 deletions
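
Both backends follow the same pattern: get_requested_parameters() starts from hard-coded defaults and overlays any values it finds in a per-model JSON file under settings/, while a new _save_settings() writes the current values back to that file. A minimal sketch of the load-and-merge half, assuming a hypothetical model name (the filename scheme mirrors the exllama code below):

import json, os

model_name = "TheBloke/some-model-GPTQ"  # hypothetical model id
settings_path = "settings/{}.exllama.model_backend.settings".format(
    model_name.replace("/", "_")  # slashes flattened so the file sits directly in settings/
)

# Hard-coded defaults, overridden key by key if a saved file exists.
saved_data = {"layers": [], "max_ctx": 2048, "compress_emb": 1, "ntk_alpha": 1}
if os.path.exists(settings_path):
    with open(settings_path, "r") as f:
        for key, value in json.load(f).items():
            saved_data[key] = value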

Changed file 1 of 2: exllama model backend

@@ -375,6 +375,12 @@ class model_backend(InferenceModel):
return tokenizer
def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
saved_data = {'layers': [], 'max_ctx': 2048, 'compress_emb': 1, 'ntk_alpha': 1}
if os.path.exists("settings/{}.exllama.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
with open("settings/{}.exllama.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
temp = json.load(f)
for key in temp:
saved_data[key] = temp[key]
requested_parameters = []
gpu_count = torch.cuda.device_count()
layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
@@ -401,7 +407,7 @@ class model_backend(InferenceModel):
"step": 1,
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="},
"check_message": "The sum of assigned layers must equal {}".format(layer_count),
"default": [layer_count if i == 0 else 0],
"default": saved_data['layers'][i] if len(saved_data['layers']) > i else layer_count if i==0 else 0,
"tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)),
"menu_path": "Layers",
"extra_classes": "",
@@ -416,7 +422,7 @@ class model_backend(InferenceModel):
"min": 2048,
"max": 16384,
"step": 512,
"default": 2048,
"default": saved_data['max_ctx'],
"tooltip": "The maximum context size the model supports",
"menu_path": "Configuration",
"extra_classes": "",
@@ -431,7 +437,7 @@ class model_backend(InferenceModel):
"min": 1,
"max": 8,
"step": 0.25,
"default": 1,
"default": saved_data['compress_emb'],
"tooltip": "If the model requires compressed embeddings, set them here",
"menu_path": "Configuration",
"extra_classes": "",
@@ -446,7 +452,7 @@ class model_backend(InferenceModel):
"min": 1,
"max": 32,
"step": 0.25,
"default": 1,
"default": saved_data['ntk_alpha'],
"tooltip": "NTK alpha value",
"menu_path": "Configuration",
"extra_classes": "",
@@ -491,3 +497,21 @@ class model_backend(InferenceModel):
self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
self.path = parameters['path'] if 'path' in parameters else None
def _save_settings(self):
with open(
"settings/{}.exllama.model_backend.settings".format(
self.model_name.replace("/", "_")
),
"w",
) as f:
json.dump(
{
"layers": self.layers if "layers" in vars(self) else [],
"max_ctx": self.model_config.max_seq_len,
"compress_emb": self.model_config.compress_pos_emb,
"ntk_alpha": self.model_config.alpha_value
},
f,
indent="",
)
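
For reference, _save_settings serializes one flat object, and because indent is the empty string each value lands on its own line without indentation. A small sketch of the resulting file contents, with hypothetical values:

import json

# The same shape _save_settings writes for the exllama backend (values hypothetical).
settings = {"layers": [30, 10], "max_ctx": 4096, "compress_emb": 1, "ntk_alpha": 1}
print(json.dumps(settings, indent=""))
# {
# "layers": [
# 30,
# 10
# ],
# "max_ctx": 4096,
# "compress_emb": 1,
# "ntk_alpha": 1
# }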

Changed file 2 of 2: exllamav2 model backend

@@ -351,6 +351,12 @@ class model_backend(InferenceModel):
return tokenizer
def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
saved_data = {'max_ctx': 2048, 'compress_emb': 1, 'ntk_alpha': 1}
if os.path.exists("settings/{}.exllama.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
with open("settings/{}.exllama.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
temp = json.load(f)
for key in temp:
saved_data[key] = temp[key]
requested_parameters = []
gpu_count = torch.cuda.device_count()
layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
@@ -363,7 +369,7 @@ class model_backend(InferenceModel):
"min": 2048,
"max": 16384,
"step": 512,
"default": 2048,
"default": saved_data['max_ctx'],
"tooltip": "The maximum context size the model supports",
"menu_path": "Configuration",
"extra_classes": "",
@@ -378,7 +384,7 @@ class model_backend(InferenceModel):
"min": 1,
"max": 8,
"step": 0.25,
"default": 1,
"default": saved_data['compress_emb'],
"tooltip": "If the model requires compressed embeddings, set them here",
"menu_path": "Configuration",
"extra_classes": "",
@@ -393,7 +399,7 @@ class model_backend(InferenceModel):
"min": 1,
"max": 32,
"step": 0.25,
"default": 1,
"default": saved_data['ntk_alpha'],
"tooltip": "NTK alpha value",
"menu_path": "Configuration",
"extra_classes": "",
@@ -420,4 +426,21 @@ class model_backend(InferenceModel):
self.model_config.sdp_thd = 0
self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
self.path = parameters['path'] if 'path' in parameters else None
def _save_settings(self):
with open(
"settings/{}.exllamav2.model_backend.settings".format(
self.model_name.replace("/", "_")
),
"w",
) as f:
json.dump(
{
"max_ctx": self.model_config.max_seq_len,
"compress_emb": self.model_config.compress_pos_emb,
"ntk_alpha": self.model_config.alpha_value
},
f,
indent="",
)
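
The exllamav2 variant stores the same configuration values minus the per-GPU layer split, under its own filename. A short round-trip sketch under the same assumptions (hypothetical model name and values; max_seq_len, compress_pos_emb and alpha_value are the attributes read from self.model_config above):

import json, os

model_name = "TheBloke/some-model-GPTQ"  # hypothetical model id
path = "settings/{}.exllamav2.model_backend.settings".format(model_name.replace("/", "_"))

# What _save_settings would write (values hypothetical).
os.makedirs("settings", exist_ok=True)
with open(path, "w") as f:
    json.dump({"max_ctx": 4096, "compress_emb": 1, "ntk_alpha": 2.6}, f, indent="")

# What get_requested_parameters would then pick up on the next model-selection pass.
saved_data = {"max_ctx": 2048, "compress_emb": 1, "ntk_alpha": 1}
if os.path.exists(path):
    with open(path, "r") as f:
        saved_data.update(json.load(f))
assert saved_data["max_ctx"] == 4096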