Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-06-05 21:59:24 +02:00)
Merge branch 'united' of https://github.com/henk717/KoboldAI into fixing-time
@@ -6,7 +6,13 @@ import torch
 import shutil
 from typing import Union
 
-from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig
+from transformers import GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig
+try:
+    from hf_bleeding_edge import AutoModelForCausalLM
+except ImportError:
+    from transformers import AutoModelForCausalLM
 
 from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
 
 import utils
 import modeling.lazy_loader as lazy_loader
@@ -25,6 +31,19 @@ class model_backend(HFTorchInferenceModel):
         super().__init__()
         self.use_4_bit = False
 
+    def is_valid(self, model_name, model_path, menu_path):
+        base_is_valid = super().is_valid(model_name, model_path, menu_path)
+        path = False
+        gen_path = "models/{}".format(model_name.replace('/', '_'))
+        if model_path is not None and os.path.exists(model_path):
+            path = model_path
+        elif os.path.exists(gen_path):
+            path = gen_path
+
+        fnames = [WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME]
+
+        return base_is_valid and any(os.path.exists(os.path.join(path, fname)) for fname in fnames)
+
     def _initialize_model(self):
         return
 
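For reference, the transformers.utils constants used by the new is_valid check resolve to the standard checkpoint filenames (for example WEIGHTS_NAME is "pytorch_model.bin" and SAFE_WEIGHTS_NAME is "model.safetensors"), so the method only asks whether any known weight file exists in the local model folder. A minimal standalone sketch of the same idea; the directory name is purely illustrative:

    import os
    from transformers.utils import WEIGHTS_NAME, SAFE_WEIGHTS_NAME

    # WEIGHTS_NAME == "pytorch_model.bin", SAFE_WEIGHTS_NAME == "model.safetensors"
    model_dir = "models/EleutherAI_gpt-neo-2.7B"  # hypothetical local download path
    has_weights = any(
        os.path.exists(os.path.join(model_dir, fname))
        for fname in (WEIGHTS_NAME, SAFE_WEIGHTS_NAME)
    )
    print("usable checkpoint found:", has_weights)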
@@ -39,23 +58,24 @@ class model_backend(HFTorchInferenceModel):
             else:
                 temp = {}
             requested_parameters.append({
-                "uitype": "toggle",
-                "unit": "bool",
-                "label": "Use 4-bit",
-                "id": "use_4_bit",
-                "default": temp['use_4_bit'] if 'use_4_bit' in temp else False,
-                "tooltip": "Whether or not to use BnB's 4-bit mode",
+                "uitype": "dropdown",
+                "unit": "text",
+                "label": "Quantization",
+                "id": "quantization",
+                "default": temp['quantization'] if 'quantization' in temp else '4bit' if dependency_exists else '16-bit',
+                "tooltip": "Whether or not to use BnB's 4-bit or 8-bit mode",
                 "menu_path": "Layers",
+                "children": [{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}, {'text': '16-bit', 'value':'16-bit'}],
                 "extra_classes": "",
                 "refresh_model_inputs": False
             })
         else:
-            logger.warning("Bitsandbytes is not installed, you can not use Huggingface models in 4-bit")
+            logger.warning("Bitsandbytes is not installed, you can not use Quantization for Huggingface models")
         return requested_parameters
 
     def set_input_parameters(self, parameters):
         super().set_input_parameters(parameters)
-        self.use_4_bit = parameters['use_4_bit'] if 'use_4_bit' in parameters else False
+        self.quantization = parameters['quantization'] if 'quantization' in parameters else False
 
     def _load(self, save_model: bool, initial_load: bool) -> None:
         utils.koboldai_vars.allowsp = True
@@ -85,13 +105,22 @@ class model_backend(HFTorchInferenceModel):
             "low_cpu_mem_usage": True,
         }
 
-        if self.use_4_bit or utils.koboldai_vars.colab_arg:
+        if self.quantization == "8bit":
             tf_kwargs.update({
                 "quantization_config":BitsAndBytesConfig(
+                    load_in_8bit=True,
+                    llm_int8_enable_fp32_cpu_offload=True
+                ),
+            })
+
+        if self.quantization == "4bit" or utils.koboldai_vars.colab_arg:
+            tf_kwargs.update({
+                "quantization_config":BitsAndBytesConfig(
                     load_in_4bit=True,
                     bnb_4bit_compute_dtype=torch.float16,
                     bnb_4bit_use_double_quant=True,
-                    bnb_4bit_quant_type='nf4'
+                    bnb_4bit_quant_type='nf4',
+                    llm_int8_enable_fp32_cpu_offload=True
                 ),
             })
 
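The tf_kwargs assembled here are forwarded to the Hugging Face from_pretrained call, so a BitsAndBytesConfig built this way ends up as its quantization_config argument. A minimal standalone sketch of the equivalent 4-bit load, assuming bitsandbytes and a CUDA GPU are available; the model name is purely illustrative:

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    # Same 4-bit settings as the hunk above; "facebook/opt-1.3b" is just an example checkpoint.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-1.3b",
        quantization_config=quant_config,
        low_cpu_mem_usage=True,
        device_map="auto",  # let accelerate place layers on the available GPU(s)
    )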
@@ -99,10 +128,16 @@ class model_backend(HFTorchInferenceModel):
             # We must disable low_cpu_mem_usage and if using a GPT-2 model
             # because GPT-2 is not compatible with this feature yet.
             tf_kwargs.pop("low_cpu_mem_usage", None)
 
+            tf_kwargs.pop("quantization_config", None)
+
             # Also, lazy loader doesn't support GPT-2 models
             self.lazy_load = False
 
+        if self.model_type == "llama":
+            tf_kwargs.update({
+                "pretraining_tp": 1 # Workaround recommended by HF to fix their mistake on the config.json tuners adopted
+            })
+
         logger.debug(
             "lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(
                 self.lazy_load,
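On the llama branch above: pretraining_tp is an attribute of the Hugging Face LlamaConfig, and values greater than 1 make the model emulate the tensor-parallel linear splits used during pretraining, which is slower; forcing it to 1 keeps the ordinary linear layers. A minimal sketch of overriding it at load time; the checkpoint name is purely illustrative:

    from transformers import AutoConfig, AutoModelForCausalLM

    # "meta-llama/Llama-2-7b-hf" is only an example checkpoint name.
    config = AutoConfig.from_pretrained("meta-llama/Llama-2-7b-hf")
    config.pretraining_tp = 1  # use the plain linear forward instead of the split tensor-parallel path
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", config=config)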
@@ -299,7 +334,7 @@ class model_backend(HFTorchInferenceModel):
                     "disk_layers": self.disk_layers
                     if "disk_layers" in vars(self)
                     else 0,
-                    "use_4_bit": self.use_4_bit,
+                    "quantization": self.quantization,
                 },
                 f,
                 indent="",