From 01429130602841d0d85ab27f6b38a4f63be0372c Mon Sep 17 00:00:00 2001
From: Nick Perez
Date: Tue, 18 Jul 2023 23:29:38 -0400
Subject: [PATCH 01/11] 8-bit toggle, fix for broken toggle values

---
 .../generic_hf_torch/class.py | 21 +++++++++++++++++++
 static/koboldai.js            |  4 +---
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 0bb954e3..49c6ca33 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -35,6 +35,17 @@ class model_backend(HFTorchInferenceModel):
                 temp = json.load(f)
         else:
             temp = {}
+        requested_parameters.append({
+            "uitype": "toggle",
+            "unit": "bool",
+            "label": "Use 8-bit",
+            "id": "use_8_bit",
+            "default": temp['use_8_bit'] if 'use_8_bit' in temp else False,
+            "tooltip": "Whether or not to use BnB's 8-bit mode",
+            "menu_path": "Layers",
+            "extra_classes": "",
+            "refresh_model_inputs": False
+        })
         requested_parameters.append({
             "uitype": "toggle",
             "unit": "bool",
             "label": "Use 4-bit",
             "id": "use_4_bit",
             "default": temp['use_4_bit'] if 'use_4_bit' in temp else False,
             "tooltip": "Whether or not to use BnB's 4-bit mode",
             "menu_path": "Layers",
             "extra_classes": "",
             "refresh_model_inputs": False
         })
@@ -53,6 +64,7 @@ class model_backend(HFTorchInferenceModel):
     def set_input_parameters(self, parameters):
         super().set_input_parameters(parameters)
         self.use_4_bit = parameters['use_4_bit'] if 'use_4_bit' in parameters else False
+        self.use_8_bit = parameters['use_8_bit'] if 'use_8_bit' in parameters else False
 
     def _load(self, save_model: bool, initial_load: bool) -> None:
         utils.koboldai_vars.allowsp = True
@@ -82,6 +94,14 @@ class model_backend(HFTorchInferenceModel):
             "low_cpu_mem_usage": True,
         }
 
+        if self.use_8_bit:
+            tf_kwargs.update({
+                "quantization_config":BitsAndBytesConfig(
+                    load_in_8bit=True,
+                    llm_int8_enable_fp32_cpu_offload=True
+                ),
+            })
+
         if self.use_4_bit or utils.koboldai_vars.colab_arg:
             tf_kwargs.update({
                 "quantization_config":BitsAndBytesConfig(
                     load_in_4bit=True,
@@ -298,6 +318,7 @@ class model_backend(HFTorchInferenceModel):
                     "disk_layers": self.disk_layers
                     if "disk_layers" in vars(self)
                     else 0,
                     "use_4_bit": self.use_4_bit,
+                    "use_8_bit": self.use_8_bit,
                 },
                 f,
                 indent="",
diff --git a/static/koboldai.js b/static/koboldai.js
index 94ac6ce4..8b70dd6a 100644
--- a/static/koboldai.js
+++ b/static/koboldai.js
@@ -2011,7 +2011,7 @@ function load_model() {
     data = {}
     if (settings_area) {
         for (const element of settings_area.querySelectorAll(".model_settings_input:not(.hidden)")) {
-            var element_data = element.value;
+            var element_data = element.getAttribute("data_type") === "bool" ? element.checked : element.value;
             if ((element.tagName == "SELECT") && (element.multiple)) {
                 element_data = [];
                 for (var i=0, iLen=element.options.length; i<iLen; i++) {
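
The backend half of this patch boils down to one extra entry in tf_kwargs: a quantization_config that is eventually forwarded to from_pretrained. Below is a minimal standalone sketch of the same BnB 8-bit load path, assuming transformers 4.30+ with bitsandbytes and accelerate installed; the checkpoint id and device map are illustrative placeholders, not values taken from this patch series.

    # Sketch of the 8-bit load path the toggle above enables.
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,
        # Let layers that don't fit in VRAM stay on the CPU in fp32
        # instead of failing the load.
        llm_int8_enable_fp32_cpu_offload=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-350m",              # placeholder checkpoint
        quantization_config=quant_config,
        low_cpu_mem_usage=True,
        device_map="auto",                # lets accelerate place the layers
    )
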
Date: Wed, 19 Jul 2023 07:58:12 -0400
Subject: [PATCH 02/11] feature(load model): select control for quantization level

---
 .../generic_hf_torch/class.py | 34 ++++++------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 49c6ca33..25d49214 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -36,24 +36,14 @@ class model_backend(HFTorchInferenceModel):
         else:
             temp = {}
         requested_parameters.append({
-            "uitype": "toggle",
-            "unit": "bool",
-            "label": "Use 8-bit",
-            "id": "use_8_bit",
-            "default": temp['use_8_bit'] if 'use_8_bit' in temp else False,
-            "tooltip": "Whether or not to use BnB's 8-bit mode",
-            "menu_path": "Layers",
-            "extra_classes": "",
-            "refresh_model_inputs": False
-        })
-        requested_parameters.append({
-            "uitype": "toggle",
-            "unit": "bool",
-            "label": "Use 4-bit",
-            "id": "use_4_bit",
-            "default": temp['use_4_bit'] if 'use_4_bit' in temp else False,
-            "tooltip": "Whether or not to use BnB's 4-bit mode",
+            "uitype": "dropdown",
+            "unit": "text",
+            "label": "Quantization",
+            "id": "quantization",
+            "default": temp['quantization'] if 'quantization' in temp else 'none',
+            "tooltip": "Whether or not to use BnB's 4-bit or 8-bit mode",
             "menu_path": "Layers",
+            "children": [{'text': 'None', 'value':'none'},{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}],
             "extra_classes": "",
             "refresh_model_inputs": False
         })
@@ -63,8 +53,7 @@ class model_backend(HFTorchInferenceModel):
 
     def set_input_parameters(self, parameters):
         super().set_input_parameters(parameters)
-        self.use_4_bit = parameters['use_4_bit'] if 'use_4_bit' in parameters else False
-        self.use_8_bit = parameters['use_8_bit'] if 'use_8_bit' in parameters else False
+        self.quantization = parameters['quantization'] if 'quantization' in parameters else False
 
     def _load(self, save_model: bool, initial_load: bool) -> None:
         utils.koboldai_vars.allowsp = True
@@ -94,7 +83,7 @@ class model_backend(HFTorchInferenceModel):
             "low_cpu_mem_usage": True,
         }
 
-        if self.use_8_bit:
+        if self.quantization == "8bit":
             tf_kwargs.update({
                 "quantization_config":BitsAndBytesConfig(
                     load_in_8bit=True,
                     llm_int8_enable_fp32_cpu_offload=True
                 ),
             })
 
-        if self.use_4_bit or utils.koboldai_vars.colab_arg:
+        if self.quantization == "4bit" or utils.koboldai_vars.colab_arg:
             tf_kwargs.update({
                 "quantization_config":BitsAndBytesConfig(
                     load_in_4bit=True,
@@ -317,8 +306,7 @@ class model_backend(HFTorchInferenceModel):
                     "disk_layers": self.disk_layers
                     if "disk_layers" in vars(self)
                     else 0,
-                    "use_4_bit": self.use_4_bit,
-                    "use_8_bit": self.use_8_bit,
+                    "quantization": self.quantization,
                 },
                 f,
                 indent="",
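
The dropdown folds the two earlier booleans into a single three-state setting, and _load now branches on the string. One quirk worth noting: set_input_parameters falls back to False rather than 'none', which still behaves like "no quantization" since False fails both string comparisons. A hedged sketch of the mapping as a standalone helper follows; the function name is illustrative (the real code inlines the branches in _load), and the patch's 4-bit branch passes further options that the excerpt truncates.

    from typing import Optional
    from transformers import BitsAndBytesConfig

    def quantization_config_for(choice: str) -> Optional[BitsAndBytesConfig]:
        # "none" (or the False fallback) means no quantization_config at all.
        if choice == "8bit":
            return BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_enable_fp32_cpu_offload=True,
            )
        if choice == "4bit":
            return BitsAndBytesConfig(load_in_4bit=True)
        return None

Collapsing the toggles also removes the inconsistent state the old UI allowed, where 4-bit and 8-bit could be enabled at the same time.
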
From ec745d8b80a86704f299da68ca385e51228fda80 Mon Sep 17 00:00:00 2001
From: Henk
Date: Fri, 21 Jul 2023 16:25:32 +0200
Subject: [PATCH 03/11] Don't accidentally block pad tokens

---
 modeling/inference_models/hf.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index e407f5b4..9e7f2c9c 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -326,6 +326,8 @@ class HFInferenceModel(InferenceModel):
                 if any(c in str(k) for c in "[]")
             ]
 
+            self.badwordsids.remove([self.tokenizer.pad_token_id])
+
             if utils.koboldai_vars.newlinemode == "n":
                 self.badwordsids.append([self.tokenizer.eos_token_id])
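
The unconditional remove() added above is what the next patch has to repair: list.remove raises ValueError when the value is absent, and a tokenizer without a pad token reports pad_token_id as None, so [None] is usually not in badwordsids at all. A small illustration with made-up token ids:

    # Why the follow-up patch wraps the call in try/except:
    badwordsids = [[50256], [15]]

    badwordsids.remove([50256])       # ok; the list becomes [[15]]
    try:
        badwordsids.remove([None])    # pad_token_id may be None
    except ValueError:
        pass                          # nothing to unblock, keep the list as-is
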
From 432cdc9a086091718928c71343d0393ff0a890e5 Mon Sep 17 00:00:00 2001
From: Henk
Date: Fri, 21 Jul 2023 16:39:58 +0200
Subject: [PATCH 04/11] Fix models with good pad tokens

---
 modeling/inference_models/hf.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index 9e7f2c9c..661840a4 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -326,7 +326,10 @@ class HFInferenceModel(InferenceModel):
                 if any(c in str(k) for c in "[]")
             ]
 
-            self.badwordsids.remove([self.tokenizer.pad_token_id])
+            try:
+                self.badwordsids.remove([self.tokenizer.pad_token_id])
+            except:
+                pass
 
             if utils.koboldai_vars.newlinemode == "n":
                 self.badwordsids.append([self.tokenizer.eos_token_id])

From da9b54ec1ca32d828ba5a4a73f700ff7cb81cc98 Mon Sep 17 00:00:00 2001
From: Henk
Date: Fri, 21 Jul 2023 19:31:38 +0200
Subject: [PATCH 05/11] Don't show API link during load

---
 aiserver.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index 0aa9bd4c..aa305664 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -1894,6 +1894,7 @@ def load_model(model_backend, initial_load=False):
         logger.message(f"KoboldAI has finished loading and is available at the following link for KoboldAI Lite: {koboldai_vars.cloudflare_link}/lite")
         logger.message(f"KoboldAI has finished loading and is available at the following link for the API: {koboldai_vars.cloudflare_link}/api")
 
+
 # Setup IP Whitelisting
 # Define a function to check if IP is allowed
 def is_allowed_ip():
@@ -10901,13 +10902,14 @@ def run():
             with open('cloudflare.log', 'w') as cloudflarelog:
                 cloudflarelog.write("KoboldAI is available at the following link : " + cloudflare)
             logger.init_ok("Webserver", status="OK")
-            if not koboldai_vars.use_colab_tpu:
+            if not koboldai_vars.use_colab_tpu and args.model:
                 # If we're using a TPU our UI will freeze during the connection to the TPU. To prevent this from showing to the user we
                 # delay the display of this message until after that step
-                logger.message(f"KoboldAI is available at the following link for UI 1: {cloudflare}")
-                logger.message(f"KoboldAI is available at the following link for UI 2: {cloudflare}/new_ui")
-                logger.message(f"KoboldAI is available at the following link for KoboldAI Lite: {cloudflare}/lite")
-                logger.message(f"KoboldAI is available at the following link for the API: {cloudflare}/api")
+                logger.message(f"KoboldAI is still loading your model but available at the following link for UI 1: {cloudflare}")
+                logger.message(f"KoboldAI is still loading your model but available at the following link for UI 2: {cloudflare}/new_ui")
+                logger.message(f"KoboldAI is still loading your model but available at the following link for KoboldAI Lite: {cloudflare}/lite")
+                logger.message(f"KoboldAI is still loading your model but available at the following link for the API: [Pending Model]")
+                logger.message(f"While the model loads you can use the above links to begin setting up your session, for generations you must wait until after it's done loading.")
             else:
                 logger.init_ok("Webserver", status="OK")
                 logger.message(f"Webserver has started, you can now connect to this machine at port: {port}")

From a17d7aae6031f34376094619b1ec4a92eb331a38 Mon Sep 17 00:00:00 2001
From: Henk
Date: Fri, 21 Jul 2023 19:42:49 +0200
Subject: [PATCH 06/11] Easier English

---
 aiserver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aiserver.py b/aiserver.py
index aa305664..dc565c97 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -10908,7 +10908,7 @@ def run():
                 logger.message(f"KoboldAI is still loading your model but available at the following link for UI 2: {cloudflare}/new_ui")
                 logger.message(f"KoboldAI is still loading your model but available at the following link for KoboldAI Lite: {cloudflare}/lite")
-                logger.message(f"KoboldAI is still loading your model but available at the following link for the API: [Pending Model]")
+                logger.message(f"KoboldAI is still loading your model but available at the following link for the API: [Loading Model...]")
                 logger.message(f"While the model loads you can use the above links to begin setting up your session, for generations you must wait until after it's done loading.")
             else:
                 logger.init_ok("Webserver", status="OK")

From e68972a270f87809c78da7f49141f5a1352db471 Mon Sep 17 00:00:00 2001
From: somebody
Date: Fri, 21 Jul 2023 16:14:13 -0500
Subject: [PATCH 07/11] Fix WI comments

---
 static/koboldai.js | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/static/koboldai.js b/static/koboldai.js
index 8b70dd6a..dc616b19 100644
--- a/static/koboldai.js
+++ b/static/koboldai.js
@@ -2408,12 +2408,12 @@ function world_info_entry(data) {
     comment.setAttribute("uid", data.uid);
    comment.value = data.comment;
     comment.onchange = function () {
-        world_info_data[this.getAttribute('uid')]['comment'] = this.textContent;
-        send_world_info(this.getAttribute('uid'));
+        world_info_data[data.uid].comment = this.value;
+        send_world_info(data.uid);
         this.classList.add("pulse");
     }
     comment.classList.remove("pulse");
-    
+
     //Let's figure out the order to insert this card
     var found = false;
     var moved = false;

From 7823da564e6558ddcdb69ecb3eb8e2d44fc428f6 Mon Sep 17 00:00:00 2001
From: Henk
Date: Sat, 22 Jul 2023 04:04:17 +0200
Subject: [PATCH 08/11] Link to Lite

---
 templates/settings flyout.html | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/templates/settings flyout.html b/templates/settings flyout.html
index 756fe07d..e394601d 100644
--- a/templates/settings flyout.html
+++ b/templates/settings flyout.html
@@ -19,8 +19,8 @@
         Home
         Settings
         Interface
-
-        Help
+
+        Lite
         open_in_new
From fa9d17b3d38a31574453edc781890ce333477fd0 Mon Sep 17 00:00:00 2001
From: Henk
Date: Sat, 22 Jul 2023 15:25:14 +0200
Subject: [PATCH 09/11] HF 4.31

---
 environments/huggingface.yml                        | 2 +-
 environments/rocm.yml                               | 2 +-
 modeling/inference_models/generic_hf_torch/class.py | 1 +
 requirements.txt                                    | 2 +-
 4 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/environments/huggingface.yml b/environments/huggingface.yml
index ef504adc..3ce1034b 100644
--- a/environments/huggingface.yml
+++ b/environments/huggingface.yml
@@ -32,7 +32,7 @@ dependencies:
     - flask-ngrok
     - flask-cors
     - lupa==1.10
-    - transformers==4.30.1
+    - transformers==4.31.0
     - huggingface_hub==0.15.1
     - safetensors==0.3.1
     - accelerate==0.20.3
diff --git a/environments/rocm.yml b/environments/rocm.yml
index 7099474d..6d5ba821 100644
--- a/environments/rocm.yml
+++ b/environments/rocm.yml
@@ -30,7 +30,7 @@ dependencies:
     - flask-ngrok
     - flask-cors
     - lupa==1.10
-    - transformers==4.30.1
+    - transformers==4.31.0
     - huggingface_hub==0.15.1
     - safetensors==0.3.1
     - accelerate==0.20.3
diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 25d49214..1cc1a373 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -81,6 +81,7 @@ class model_backend(HFTorchInferenceModel):
 
         tf_kwargs = {
             "low_cpu_mem_usage": True,
+            "pretraining_tp": 1,
         }
 
diff --git a/requirements.txt b/requirements.txt
index 2e724555..a3920d6b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-transformers==4.30.*
+transformers==4.31.*
 huggingface_hub==0.15.1
 Flask==2.2.3
 Flask-SocketIO==5.3.2

From 8dd7b93a6c94d0fcd8dede9b6d3bb743c7f20369 Mon Sep 17 00:00:00 2001
From: Henk
Date: Sat, 22 Jul 2023 16:29:55 +0200
Subject: [PATCH 10/11] HF's workaround breaks stuff

---
 modeling/inference_models/generic_hf_torch/class.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 1cc1a373..25d49214 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -81,7 +81,6 @@ class model_backend(HFTorchInferenceModel):
 
         tf_kwargs = {
             "low_cpu_mem_usage": True,
-            "pretraining_tp": 1,
         }
 
From 7a5d813b92a0572f62bf9fd0586626be16c5b8fa Mon Sep 17 00:00:00 2001
From: Henk
Date: Sat, 22 Jul 2023 16:59:49 +0200
Subject: [PATCH 11/11] Reimplement HF workaround only for llama

---
 modeling/inference_models/generic_hf_torch/class.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 25d49214..e271ca5c 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -110,6 +110,11 @@ class model_backend(HFTorchInferenceModel):
             # Also, lazy loader doesn't support GPT-2 models
             self.lazy_load = False
 
+        if self.model_type == "llama":
+            tf_kwargs.update({
+                "pretraining_tp": 1 # Workaround recommended by HF to fix their mistake on the config.json tuners adopted
+            })
+
         logger.debug(
             "lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(
                 self.lazy_load,
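
Patches 09 through 11 converge on scoping the workaround to llama only: transformers 4.31 introduced pretraining_tp on the llama config (an experimental tensor-parallelism degree used to replicate pretraining numerics), and pinning it to 1 keeps the regular un-sliced linear path, while passing it for every architecture broke non-llama loads (hence the revert in patch 10). A hedged sketch of the equivalent direct call; the checkpoint id is a placeholder.

    from transformers import AutoModelForCausalLM

    # Equivalent of the llama-only branch in patch 11: keyword arguments
    # that from_pretrained does not consume are applied as config overrides.
    model = AutoModelForCausalLM.from_pretrained(
        "huggyllama/llama-7b",   # placeholder llama checkpoint
        low_cpu_mem_usage=True,
        pretraining_tp=1,        # llama-specific; other configs don't define it
    )
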