From 3dd0e91fbb82a7fd16091abbfbb6447492f08d9a Mon Sep 17 00:00:00 2001
From: Henk
Date: Mon, 21 Aug 2023 01:58:52 +0200
Subject: [PATCH] Preliminary HF GPTQ changes

---
 .../generic_hf_torch/class.py                 | 60 ++++++++++---------
 .../inference_models/gptq_hf_torch/class.py   |  2 +-
 2 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index a7734e7d..9b1049cf 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -57,18 +57,19 @@ class model_backend(HFTorchInferenceModel):
                 temp = json.load(f)
             else:
                 temp = {}
-            requested_parameters.append({
-                "uitype": "dropdown",
-                "unit": "text",
-                "label": "Quantization",
-                "id": "quantization",
-                "default": temp['quantization'] if 'quantization' in temp else '4bit' if dependency_exists else '16-bit',
-                "tooltip": "Whether or not to use BnB's 4-bit or 8-bit mode",
-                "menu_path": "Layers",
-                "children": [{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}, {'text': '16-bit', 'value':'16-bit'}],
-                "extra_classes": "",
-                "refresh_model_inputs": False
-            })
+            if not hasattr(self.model_config, 'quantization_config'):
+                requested_parameters.append({
+                    "uitype": "dropdown",
+                    "unit": "text",
+                    "label": "Quantization",
+                    "id": "quantization",
+                    "default": temp['quantization'] if 'quantization' in temp else '4bit' if dependency_exists else '16-bit',
+                    "tooltip": "Whether or not to use BnB's 4-bit or 8-bit mode",
+                    "menu_path": "Layers",
+                    "children": [{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}, {'text': '16-bit', 'value':'16-bit'}],
+                    "extra_classes": "",
+                    "refresh_model_inputs": False
+                })
         else:
             logger.warning("Bitsandbytes is not installed, you can not use Quantization for Huggingface models")
         return requested_parameters
@@ -105,24 +106,25 @@ class model_backend(HFTorchInferenceModel):
             "low_cpu_mem_usage": True,
         }
 
-        if self.quantization == "8bit":
-            tf_kwargs.update({
-                "quantization_config":BitsAndBytesConfig(
-                    load_in_8bit=True,
-                    llm_int8_enable_fp32_cpu_offload=True
-                ),
-            })
+        if not hasattr(self.model_config, 'quantization_config'):
+            if self.quantization == "8bit":
+                tf_kwargs.update({
+                    "quantization_config":BitsAndBytesConfig(
+                        load_in_8bit=True,
+                        llm_int8_enable_fp32_cpu_offload=True
+                    ),
+                })
 
-        if self.quantization == "4bit" or utils.koboldai_vars.colab_arg:
-            tf_kwargs.update({
-                "quantization_config":BitsAndBytesConfig(
-                    load_in_4bit=True,
-                    bnb_4bit_compute_dtype=torch.float16,
-                    bnb_4bit_use_double_quant=True,
-                    bnb_4bit_quant_type='nf4',
-                    llm_int8_enable_fp32_cpu_offload=True
-                ),
-            })
+            if self.quantization == "4bit" or utils.koboldai_vars.colab_arg:
+                tf_kwargs.update({
+                    "quantization_config":BitsAndBytesConfig(
+                        load_in_4bit=True,
+                        bnb_4bit_compute_dtype=torch.float16,
+                        bnb_4bit_use_double_quant=True,
+                        bnb_4bit_quant_type='nf4',
+                        llm_int8_enable_fp32_cpu_offload=True
+                    ),
+                })
 
         if self.model_type == "gpt2":
             # We must disable low_cpu_mem_usage and if using a GPT-2 model
diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py
index aa65a295..3d044b6f 100644
--- a/modeling/inference_models/gptq_hf_torch/class.py
+++ b/modeling/inference_models/gptq_hf_torch/class.py
@@ -21,7 +21,7 @@ from pathlib import Path
 
 
 model_backend_type = "GPTQ"
-model_backend_name = "Huggingface GPTQ"
+model_backend_name = "Legacy GPTQ"
 
 
 def load_model_gptq_settings(path):
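
Note for reviewers: the guard added in both hunks boils down to skipping the BitsAndBytes path whenever the checkpoint's config already declares its own quantization, as GPTQ models do via a quantization_config entry in config.json. Below is a minimal standalone sketch of that detection, not the KoboldAI code itself; the model path is hypothetical and AutoConfig stands in for the backend's own config loading.

    # Illustrative sketch: only request BnB quantization when the checkpoint
    # does not already ship its own quantization settings (e.g. GPTQ).
    import torch
    from transformers import AutoConfig, BitsAndBytesConfig

    model_path = "models/example-model"  # hypothetical path, illustration only
    config = AutoConfig.from_pretrained(model_path)

    tf_kwargs = {"low_cpu_mem_usage": True}
    if not hasattr(config, "quantization_config"):
        # No baked-in quantization found, so a BnB 4-bit config can safely be
        # passed through to from_pretrained().
        tf_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            llm_int8_enable_fp32_cpu_offload=True,
        )

The same hasattr check also hides the 4-bit/8-bit dropdown in the UI hunk, so a pre-quantized checkpoint is never asked to re-quantize at load time.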