From d5dd24a3303c6f85360eff5a771d8d2c102bf779 Mon Sep 17 00:00:00 2001
From: ebolam
Date: Thu, 12 Oct 2023 21:04:00 -0400
Subject: [PATCH] Added setting saving for exllama and exllamav2

---
 modeling/inference_models/exllama/class.py   | 32 +++++++++++++++++---
 modeling/inference_models/exllamav2/class.py | 31 ++++++++++++++++---
 2 files changed, 55 insertions(+), 8 deletions(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index 569f6d61..b52c2d65 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -375,6 +375,12 @@ class model_backend(InferenceModel):
         return tokenizer
 
     def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
+        saved_data = {'layers': [], 'max_ctx': 2048, 'compress_emb': 1, 'ntk_alpha': 1}
+        if os.path.exists("settings/{}.exllama.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
+            with open("settings/{}.exllama.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
+                temp = json.load(f)
+                for key in temp:
+                    saved_data[key] = temp[key]
         requested_parameters = []
         gpu_count = torch.cuda.device_count()
         layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
@@ -401,7 +407,7 @@ class model_backend(InferenceModel):
                 "step": 1,
                 "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="},
                 "check_message": "The sum of assigned layers must equal {}".format(layer_count),
-                "default": [layer_count if i == 0 else 0],
+                "default": saved_data['layers'][i] if len(saved_data['layers']) > i else layer_count if i==0 else 0,
                 "tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)),
                 "menu_path": "Layers",
                 "extra_classes": "",
@@ -416,7 +422,7 @@
                 "min": 2048,
                 "max": 16384,
                 "step": 512,
-                "default": 2048,
+                "default": saved_data['max_ctx'],
                 "tooltip": "The maximum context size the model supports",
                 "menu_path": "Configuration",
                 "extra_classes": "",
@@ -431,7 +437,7 @@
                 "min": 1,
                 "max": 8,
                 "step": 0.25,
-                "default": 1,
+                "default": saved_data['compress_emb'],
                 "tooltip": "If the model requires compressed embeddings, set them here",
                 "menu_path": "Configuration",
                 "extra_classes": "",
@@ -446,7 +452,7 @@
                 "min": 1,
                 "max": 32,
                 "step": 0.25,
-                "default": 1,
+                "default": saved_data['ntk_alpha'],
                 "tooltip": "NTK alpha value",
                 "menu_path": "Configuration",
                 "extra_classes": "",
@@ -491,3 +497,21 @@ class model_backend(InferenceModel):
 
         self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
         self.path = parameters['path'] if 'path' in parameters else None
+
+    def _save_settings(self):
+        with open(
+            "settings/{}.exllama.model_backend.settings".format(
+                self.model_name.replace("/", "_")
+            ),
+            "w",
+        ) as f:
+            json.dump(
+                {
+                    "layers": self.layers if "layers" in vars(self) else [],
+                    "max_ctx": self.model_config.max_seq_len,
+                    "compress_emb": self.model_config.compress_pos_emb,
+                    "ntk_alpha": self.model_config.alpha_value
+                },
+                f,
+                indent="",
+            )
\ No newline at end of file
diff --git a/modeling/inference_models/exllamav2/class.py b/modeling/inference_models/exllamav2/class.py
index dd97e83f..7757202f 100644
--- a/modeling/inference_models/exllamav2/class.py
+++ b/modeling/inference_models/exllamav2/class.py
@@ -351,6 +351,12 @@ class model_backend(InferenceModel):
         return tokenizer
 
     def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
+        saved_data = {'max_ctx': 2048, 'compress_emb': 1, 'ntk_alpha': 1}
+        if os.path.exists("settings/{}.exllamav2.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
+            with open("settings/{}.exllamav2.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
+                temp = json.load(f)
+                for key in temp:
+                    saved_data[key] = temp[key]
         requested_parameters = []
         gpu_count = torch.cuda.device_count()
         layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
@@ -363,7 +369,7 @@
                 "min": 2048,
                 "max": 16384,
                 "step": 512,
-                "default": 2048,
+                "default": saved_data['max_ctx'],
                 "tooltip": "The maximum context size the model supports",
                 "menu_path": "Configuration",
                 "extra_classes": "",
@@ -378,7 +384,7 @@
                 "min": 1,
                 "max": 8,
                 "step": 0.25,
-                "default": 1,
+                "default": saved_data['compress_emb'],
                 "tooltip": "If the model requires compressed embeddings, set them here",
                 "menu_path": "Configuration",
                 "extra_classes": "",
@@ -393,7 +399,7 @@
                 "min": 1,
                 "max": 32,
                 "step": 0.25,
-                "default": 1,
+                "default": saved_data['ntk_alpha'],
                 "tooltip": "NTK alpha value",
                 "menu_path": "Configuration",
                 "extra_classes": "",
@@ -420,4 +426,21 @@
         self.model_config.sdp_thd = 0
 
         self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
-        self.path = parameters['path'] if 'path' in parameters else None
\ No newline at end of file
+        self.path = parameters['path'] if 'path' in parameters else None
+
+    def _save_settings(self):
+        with open(
+            "settings/{}.exllamav2.model_backend.settings".format(
+                self.model_name.replace("/", "_")
+            ),
+            "w",
+        ) as f:
+            json.dump(
+                {
+                    "max_ctx": self.model_config.max_seq_len,
+                    "compress_emb": self.model_config.compress_pos_emb,
+                    "ntk_alpha": self.model_config.alpha_value
+                },
+                f,
+                indent="",
+            )
\ No newline at end of file
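
Both backends follow the same pattern: get_requested_parameters overlays any previously saved values on top of hard-coded defaults, and _save_settings dumps the live config back to settings/<model>.<backend>.model_backend.settings. Below is a minimal standalone sketch of that round trip; the file name and the values passed in are illustrative stand-ins only, since the real code derives the path from the model name and pulls the values from self.model_config and self.layers.

    import json
    import os

    # Defaults mirror the hard-coded values in get_requested_parameters.
    DEFAULTS = {"layers": [], "max_ctx": 2048, "compress_emb": 1, "ntk_alpha": 1}

    # Illustrative path only; the patch builds it from the model name and backend id.
    SETTINGS_PATH = "settings/example_model.exllama.model_backend.settings"

    def save_settings(path, layers, max_ctx, compress_emb, ntk_alpha):
        # Counterpart of _save_settings: write the current values out as JSON.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w") as f:
            json.dump(
                {"layers": layers, "max_ctx": max_ctx,
                 "compress_emb": compress_emb, "ntk_alpha": ntk_alpha},
                f,
                indent="",
            )

    def load_settings(path):
        # Counterpart of the load block: start from the defaults and overlay
        # whatever keys the saved file happens to contain.
        saved = dict(DEFAULTS)
        if os.path.exists(path):
            with open(path, "r") as f:
                for key, value in json.load(f).items():
                    saved[key] = value
        return saved

    if __name__ == "__main__":
        save_settings(SETTINGS_PATH, layers=[32, 0], max_ctx=4096, compress_emb=1, ntk_alpha=1)
        print(load_settings(SETTINGS_PATH))

Because missing keys simply fall back to the defaults, an older settings file written before a given key existed still loads cleanly and only overrides the values it actually contains.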