From 01429130602841d0d85ab27f6b38a4f63be0372c Mon Sep 17 00:00:00 2001
From: Nick Perez
Date: Tue, 18 Jul 2023 23:29:38 -0400
Subject: [PATCH 01/11] 8-bit toggle, fix for broken toggle values

---
 .../generic_hf_torch/class.py | 21 +++++++++++++++++++
 static/koboldai.js            |  4 +---
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 0bb954e3..49c6ca33 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -35,6 +35,17 @@ class model_backend(HFTorchInferenceModel):
                 temp = json.load(f)
         else:
             temp = {}
+        requested_parameters.append({
+            "uitype": "toggle",
+            "unit": "bool",
+            "label": "Use 8-bit",
+            "id": "use_8_bit",
+            "default": temp['use_8_bit'] if 'use_8_bit' in temp else False,
+            "tooltip": "Whether or not to use BnB's 8-bit mode",
+            "menu_path": "Layers",
+            "extra_classes": "",
+            "refresh_model_inputs": False
+        })
         requested_parameters.append({
             "uitype": "toggle",
             "unit": "bool",
             "label": "Use 4-bit",
             "id": "use_4_bit",
             "default": temp['use_4_bit'] if 'use_4_bit' in temp else False,
             "tooltip": "Whether or not to use BnB's 4-bit mode",
             "menu_path": "Layers",
             "extra_classes": "",
             "refresh_model_inputs": False
         })
@@ -53,6 +64,7 @@ class model_backend(HFTorchInferenceModel):
     def set_input_parameters(self, parameters):
         super().set_input_parameters(parameters)
         self.use_4_bit = parameters['use_4_bit'] if 'use_4_bit' in parameters else False
+        self.use_8_bit = parameters['use_8_bit'] if 'use_8_bit' in parameters else False
 
     def _load(self, save_model: bool, initial_load: bool) -> None:
         utils.koboldai_vars.allowsp = True
@@ -82,6 +94,14 @@ class model_backend(HFTorchInferenceModel):
             "low_cpu_mem_usage": True,
         }
 
+        if self.use_8_bit:
+            tf_kwargs.update({
+                "quantization_config":BitsAndBytesConfig(
+                    load_in_8bit=True,
+                    llm_int8_enable_fp32_cpu_offload=True
+                ),
+            })
+
         if self.use_4_bit or utils.koboldai_vars.colab_arg:
             tf_kwargs.update({
                 "quantization_config":BitsAndBytesConfig(
                     load_in_4bit=True,
@@ -298,6 +318,7 @@ class model_backend(HFTorchInferenceModel):
                     "disk_layers": self.disk_layers
                     if "disk_layers" in vars(self)
                     else 0,
                     "use_4_bit": self.use_4_bit,
+                    "use_8_bit": self.use_8_bit,
                 },
                 f,
                 indent="",
diff --git a/static/koboldai.js b/static/koboldai.js
index 94ac6ce4..8b70dd6a 100644
--- a/static/koboldai.js
+++ b/static/koboldai.js
@@ -2011,7 +2011,7 @@ function load_model() {
     data = {}
     if (settings_area) {
         for (const element of settings_area.querySelectorAll(".model_settings_input:not(.hidden)")) {
-            var element_data = element.value;
+            var element_data = element.getAttribute("data_type") === "bool" ? element.checked : element.value;
             if ((element.tagName == "SELECT") && (element.multiple)) {
                 element_data = [];
                 for (var i=0, iLen=element.options.length; i<iLen; i++) {
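
The backend half of this patch boils down to one extra entry in tf_kwargs: a quantization_config that is eventually forwarded to from_pretrained. Below is a minimal standalone sketch of the same BnB 8-bit load path, assuming transformers 4.30+ with bitsandbytes and accelerate installed; the checkpoint id and device map are illustrative placeholders, not values taken from this patch series.

    # Sketch of the 8-bit load path the toggle above enables.
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,
        # Let layers that don't fit in VRAM stay on the CPU in fp32
        # instead of failing the load.
        llm_int8_enable_fp32_cpu_offload=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-350m",              # placeholder checkpoint
        quantization_config=quant_config,
        low_cpu_mem_usage=True,
        device_map="auto",                # lets accelerate place the layers
    )
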
Date: Wed, 19 Jul 2023 07:58:12 -0400
Subject: [PATCH 02/11] feature(load model): select control for quantization level

---
 .../generic_hf_torch/class.py | 34 ++++++------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 49c6ca33..25d49214 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -36,24 +36,14 @@ class model_backend(HFTorchInferenceModel):
         else:
             temp = {}
         requested_parameters.append({
-            "uitype": "toggle",
-            "unit": "bool",
-            "label": "Use 8-bit",
-            "id": "use_8_bit",
-            "default": temp['use_8_bit'] if 'use_8_bit' in temp else False,
-            "tooltip": "Whether or not to use BnB's 8-bit mode",
-            "menu_path": "Layers",
-            "extra_classes": "",
-            "refresh_model_inputs": False
-        })
-        requested_parameters.append({
-            "uitype": "toggle",
-            "unit": "bool",
-            "label": "Use 4-bit",
-            "id": "use_4_bit",
-            "default": temp['use_4_bit'] if 'use_4_bit' in temp else False,
-            "tooltip": "Whether or not to use BnB's 4-bit mode",
+            "uitype": "dropdown",
+            "unit": "text",
+            "label": "Quantization",
+            "id": "quantization",
+            "default": temp['quantization'] if 'quantization' in temp else 'none',
+            "tooltip": "Whether or not to use BnB's 4-bit or 8-bit mode",
             "menu_path": "Layers",
+            "children": [{'text': 'None', 'value':'none'},{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}],
             "extra_classes": "",
             "refresh_model_inputs": False
         })
@@ -63,8 +53,7 @@ class model_backend(HFTorchInferenceModel):
 
     def set_input_parameters(self, parameters):
         super().set_input_parameters(parameters)
-        self.use_4_bit = parameters['use_4_bit'] if 'use_4_bit' in parameters else False
-        self.use_8_bit = parameters['use_8_bit'] if 'use_8_bit' in parameters else False
+        self.quantization = parameters['quantization'] if 'quantization' in parameters else False
 
     def _load(self, save_model: bool, initial_load: bool) -> None:
         utils.koboldai_vars.allowsp = True
@@ -94,7 +83,7 @@ class model_backend(HFTorchInferenceModel):
             "low_cpu_mem_usage": True,
         }
 
-        if self.use_8_bit:
+        if self.quantization == "8bit":
             tf_kwargs.update({
                 "quantization_config":BitsAndBytesConfig(
                     load_in_8bit=True,
                     llm_int8_enable_fp32_cpu_offload=True
                 ),
             })
 
-        if self.use_4_bit or utils.koboldai_vars.colab_arg:
+        if self.quantization == "4bit" or utils.koboldai_vars.colab_arg:
             tf_kwargs.update({
                 "quantization_config":BitsAndBytesConfig(
                     load_in_4bit=True,
@@ -317,8 +306,7 @@ class model_backend(HFTorchInferenceModel):
                     "disk_layers": self.disk_layers
                     if "disk_layers" in vars(self)
                     else 0,
-                    "use_4_bit": self.use_4_bit,
-                    "use_8_bit": self.use_8_bit,
+                    "quantization": self.quantization,
                 },
                 f,
                 indent="",
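
The dropdown folds the two earlier booleans into a single three-state setting, and _load now branches on the string. One quirk worth noting: set_input_parameters falls back to False rather than 'none', which still behaves like "no quantization" since False fails both string comparisons. A hedged sketch of the mapping as a standalone helper follows; the function name is illustrative (the real code inlines the branches in _load), and the patch's 4-bit branch passes further options that the excerpt truncates.

    from typing import Optional
    from transformers import BitsAndBytesConfig

    def quantization_config_for(choice: str) -> Optional[BitsAndBytesConfig]:
        # "none" (or the False fallback) means no quantization_config at all.
        if choice == "8bit":
            return BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_enable_fp32_cpu_offload=True,
            )
        if choice == "4bit":
            return BitsAndBytesConfig(load_in_4bit=True)
        return None

Collapsing the toggles also removes the inconsistent state the old UI allowed, where 4-bit and 8-bit could be enabled at the same time.
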
From ec745d8b80a86704f299da68ca385e51228fda80 Mon Sep 17 00:00:00 2001
From: Henk
Date: Fri, 21 Jul 2023 16:25:32 +0200
Subject: [PATCH 03/11] Don't accidentally block pad tokens

---
 modeling/inference_models/hf.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index e407f5b4..9e7f2c9c 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -326,6 +326,8 @@ class HFInferenceModel(InferenceModel):
                 if any(c in str(k) for c in "[]")
             ]
 
+            self.badwordsids.remove([self.tokenizer.pad_token_id])
+
             if utils.koboldai_vars.newlinemode == "n":
                 self.badwordsids.append([self.tokenizer.eos_token_id])
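
The unconditional remove() added above is what the next patch has to repair: list.remove raises ValueError when the value is absent, and a tokenizer without a pad token reports pad_token_id as None, so [None] is usually not in badwordsids at all. A small illustration with made-up token ids:

    # Why the follow-up patch wraps the call in try/except:
    badwordsids = [[50256], [15]]

    badwordsids.remove([50256])       # ok; the list becomes [[15]]
    try:
        badwordsids.remove([None])    # pad_token_id may be None
    except ValueError:
        pass                          # nothing to unblock, keep the list as-is
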
From 432cdc9a086091718928c71343d0393ff0a890e5 Mon Sep 17 00:00:00 2001
From: Henk
Date: Fri, 21 Jul 2023 16:39:58 +0200
Subject: [PATCH 04/11] Fix models with good pad tokens

---
 modeling/inference_models/hf.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index 9e7f2c9c..661840a4 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -326,7 +326,10 @@ class HFInferenceModel(InferenceModel):
                 if any(c in str(k) for c in "[]")
             ]
 
-            self.badwordsids.remove([self.tokenizer.pad_token_id])
+            try:
+                self.badwordsids.remove([self.tokenizer.pad_token_id])
+            except:
+                pass
 
             if utils.koboldai_vars.newlinemode == "n":
                 self.badwordsids.append([self.tokenizer.eos_token_id])

From da9b54ec1ca32d828ba5a4a73f700ff7cb81cc98 Mon Sep 17 00:00:00 2001
From: Henk
Date: Fri, 21 Jul 2023 19:31:38 +0200
Subject: [PATCH 05/11] Don't show API link during load

---
 aiserver.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index 0aa9bd4c..aa305664 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -1894,6 +1894,7 @@ def load_model(model_backend, initial_load=False):
         logger.message(f"KoboldAI has finished loading and is available at the following link for KoboldAI Lite: {koboldai_vars.cloudflare_link}/lite")
         logger.message(f"KoboldAI has finished loading and is available at the following link for the API: {koboldai_vars.cloudflare_link}/api")
 
+
 # Setup IP Whitelisting
 # Define a function to check if IP is allowed
 def is_allowed_ip():
@@ -10901,13 +10902,14 @@ def run():
             with open('cloudflare.log', 'w') as cloudflarelog:
                 cloudflarelog.write("KoboldAI is available at the following link : " + cloudflare)
             logger.init_ok("Webserver", status="OK")
-            if not koboldai_vars.use_colab_tpu:
+            if not koboldai_vars.use_colab_tpu and args.model:
                 # If we're using a TPU our UI will freeze during the connection to the TPU. To prevent this from showing to the user we
                 # delay the display of this message until after that step
-                logger.message(f"KoboldAI is available at the following link for UI 1: {cloudflare}")
-                logger.message(f"KoboldAI is available at the following link for UI 2: {cloudflare}/new_ui")
-                logger.message(f"KoboldAI is available at the following link for KoboldAI Lite: {cloudflare}/lite")
-                logger.message(f"KoboldAI is available at the following link for the API: {cloudflare}/api")
+                logger.message(f"KoboldAI is still loading your model but available at the following link for UI 1: {cloudflare}")
+                logger.message(f"KoboldAI is still loading your model but available at the following link for UI 2: {cloudflare}/new_ui")
+                logger.message(f"KoboldAI is still loading your model but available at the following link for KoboldAI Lite: {cloudflare}/lite")
+                logger.message(f"KoboldAI is still loading your model but available at the following link for the API: [Pending Model]")
+                logger.message(f"While the model loads you can use the above links to begin setting up your session, for generations you must wait until after it's done loading.")
             else:
                 logger.init_ok("Webserver", status="OK")
                 logger.message(f"Webserver has started, you can now connect to this machine at port: {port}")

From a17d7aae6031f34376094619b1ec4a92eb331a38 Mon Sep 17 00:00:00 2001
From: Henk
Date: Fri, 21 Jul 2023 19:42:49 +0200
Subject: [PATCH 06/11] Easier English

---
 aiserver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aiserver.py b/aiserver.py
index aa305664..dc565c97 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -10908,7 +10908,7 @@ def run():
                 logger.message(f"KoboldAI is still loading your model but available at the following link for UI 2: {cloudflare}/new_ui")
                 logger.message(f"KoboldAI is still loading your model but available at the following link for KoboldAI Lite: {cloudflare}/lite")
-                logger.message(f"KoboldAI is still loading your model but available at the following link for the API: [Pending Model]")
+                logger.message(f"KoboldAI is still loading your model but available at the following link for the API: [Loading Model...]")
                 logger.message(f"While the model loads you can use the above links to begin setting up your session, for generations you must wait until after it's done loading.")
             else:
                 logger.init_ok("Webserver", status="OK")

From e68972a270f87809c78da7f49141f5a1352db471 Mon Sep 17 00:00:00 2001
From: somebody
Date: Fri, 21 Jul 2023 16:14:13 -0500
Subject: [PATCH 07/11] Fix WI comments

---
 static/koboldai.js | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/static/koboldai.js b/static/koboldai.js
index 8b70dd6a..dc616b19 100644
--- a/static/koboldai.js
+++ b/static/koboldai.js
@@ -2408,12 +2408,12 @@ function world_info_entry(data) {
     comment.setAttribute("uid", data.uid);
    comment.value = data.comment;
     comment.onchange = function () {
-        world_info_data[this.getAttribute('uid')]['comment'] = this.textContent;
-        send_world_info(this.getAttribute('uid'));
+        world_info_data[data.uid].comment = this.value;
+        send_world_info(data.uid);
         this.classList.add("pulse");
     }
     comment.classList.remove("pulse");
-    
+
     //Let's figure out the order to insert this card
     var found = false;
     var moved = false;

From 7823da564e6558ddcdb69ecb3eb8e2d44fc428f6 Mon Sep 17 00:00:00 2001
From: Henk
Date: Sat, 22 Jul 2023 04:04:17 +0200
Subject: [PATCH 08/11] Link to Lite

---
 templates/settings flyout.html | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/templates/settings flyout.html b/templates/settings flyout.html
index 756fe07d..e394601d 100644
--- a/templates/settings flyout.html
+++ b/templates/settings flyout.html
@@ -19,8 +19,8 @@
         Home
         Settings
         Interface
-
-        Help
+
+        Lite
         open_in_new
From fa9d17b3d38a31574453edc781890ce333477fd0 Mon Sep 17 00:00:00 2001
From: Henk
Date: Sat, 22 Jul 2023 15:25:14 +0200
Subject: [PATCH 09/11] HF 4.31

---
 environments/huggingface.yml                        | 2 +-
 environments/rocm.yml                               | 2 +-
 modeling/inference_models/generic_hf_torch/class.py | 1 +
 requirements.txt                                    | 2 +-
 4 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/environments/huggingface.yml b/environments/huggingface.yml
index ef504adc..3ce1034b 100644
--- a/environments/huggingface.yml
+++ b/environments/huggingface.yml
@@ -32,7 +32,7 @@ dependencies:
     - flask-ngrok
     - flask-cors
     - lupa==1.10
-    - transformers==4.30.1
+    - transformers==4.31.0
     - huggingface_hub==0.15.1
     - safetensors==0.3.1
     - accelerate==0.20.3
diff --git a/environments/rocm.yml b/environments/rocm.yml
index 7099474d..6d5ba821 100644
--- a/environments/rocm.yml
+++ b/environments/rocm.yml
@@ -30,7 +30,7 @@ dependencies:
     - flask-ngrok
     - flask-cors
     - lupa==1.10
-    - transformers==4.30.1
+    - transformers==4.31.0
     - huggingface_hub==0.15.1
     - safetensors==0.3.1
     - accelerate==0.20.3
diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 25d49214..1cc1a373 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -81,6 +81,7 @@ class model_backend(HFTorchInferenceModel):
 
         tf_kwargs = {
             "low_cpu_mem_usage": True,
+            "pretraining_tp": 1,
         }
 
diff --git a/requirements.txt b/requirements.txt
index 2e724555..a3920d6b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-transformers==4.30.*
+transformers==4.31.*
 huggingface_hub==0.15.1
 Flask==2.2.3
 Flask-SocketIO==5.3.2

From 8dd7b93a6c94d0fcd8dede9b6d3bb743c7f20369 Mon Sep 17 00:00:00 2001
From: Henk
Date: Sat, 22 Jul 2023 16:29:55 +0200
Subject: [PATCH 10/11] HF's workaround breaks stuff

---
 modeling/inference_models/generic_hf_torch/class.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 1cc1a373..25d49214 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -81,7 +81,6 @@ class model_backend(HFTorchInferenceModel):
 
         tf_kwargs = {
             "low_cpu_mem_usage": True,
-            "pretraining_tp": 1,
         }
 
From 7a5d813b92a0572f62bf9fd0586626be16c5b8fa Mon Sep 17 00:00:00 2001
From: Henk
Date: Sat, 22 Jul 2023 16:59:49 +0200
Subject: [PATCH 11/11] Reimplement HF workaround only for llama

---
 modeling/inference_models/generic_hf_torch/class.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 25d49214..e271ca5c 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -110,6 +110,11 @@ class model_backend(HFTorchInferenceModel):
             # Also, lazy loader doesn't support GPT-2 models
             self.lazy_load = False
 
+        if self.model_type == "llama":
+            tf_kwargs.update({
+                "pretraining_tp": 1 # Workaround recommended by HF to fix their mistake on the config.json tuners adopted
+            })
+
         logger.debug(
             "lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(
                 self.lazy_load,
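
Patches 09 through 11 converge on scoping the workaround to llama only: transformers 4.31 introduced pretraining_tp on the llama config (an experimental tensor-parallelism degree used to replicate pretraining numerics), and pinning it to 1 keeps the regular un-sliced linear path, while passing it for every architecture broke non-llama loads (hence the revert in patch 10). A hedged sketch of the equivalent direct call; the checkpoint id is a placeholder.

    from transformers import AutoModelForCausalLM

    # Equivalent of the llama-only branch in patch 11: keyword arguments
    # that from_pretrained does not consume are applied as config overrides.
    model = AutoModelForCausalLM.from_pretrained(
        "huggyllama/llama-7b",   # placeholder llama checkpoint
        low_cpu_mem_usage=True,
        pretraining_tp=1,        # llama-specific; other configs don't define it
    )
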