Merge pull request #367 from 0cc4m/4bit-plugin

GPTQ module
2025-06-05 21:59:24 +02:00 · 2023-07-23 22:32:20 +02:00
parent e33a58b74a 73953068c0
commit 1facc73b66
10 changed files with 344 additions and 8 deletions
--- a/README_GPTQ.md
+++ b/README_GPTQ.md
@@ -0,0 +1,50 @@
+### Install/Use Guide
+(This guide is for both Linux and Windows and assumes the user has git installed and a basic grasp of command-line use.)
+
+#### Installation
+In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created.
+
+Note: do not run your command prompt as administrator/with elevated privileges; reports suggest this leads to problems.
+
+`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules`
+
+`cd KoboldAI`
+
+Next step: install the requirements. (On Windows, either the subfolder mode or the B: option works; it doesn't matter which you choose.)
+
+* [if on Windows]
+  ```
+  install_requirements.bat
+  ```
+  * if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory.
+
+* [if on Linux with Nvidia] 
+  ```
+  ./install_requirements.sh
+  ```
+* [if on Linux with AMD]
+  ```
+  ./install_requirements.sh rocm
+  ./commandline-rocm.sh
+  pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa@c884b421a233f9603d8224c9b22c2d83dd2c1fc4
+  ```
+  * If you get an error about a missing `hip/hip_runtime_xxx.h`, you don't have the proper ROCm & HIP packages installed.
+  * If you get an error saying the `CUDA_HOME` envar is not set, run this in the env:
+    `pip3 install torch --index-url https://download.pytorch.org/whl/rocm5.4.2 --force-reinstall`
+
+#### Setting up models
+If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder)
+
+Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type).
+
+Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models, or `4bit-<groupsize>g.pt` or `4bit-<groupsize>g.safetensors` for a groupsize model (Example: `4bit-128g.safetensors`).
+
+So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, it'll load the quantized model.
+
+#### Running KoboldAI and loading 4bit models
+If you haven't done so already, exit the command prompt/leave KAI's conda env. (Close the commandline window on Windows, run `exit` on Linux)
+
+Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD]
+
+Switch to UI2, then load your model.
+
--- a/environments/huggingface.yml
+++ b/environments/huggingface.yml
@@ -46,5 +46,11 @@ dependencies:
    - ftfy
    - pydub
    - diffusers
+    - git+https://github.com/0cc4m/hf_bleeding_edge/
+    - --find-links=https://0cc4m.github.io/GPTQ-for-LLaMa/gptq-whl-links.html
+    - gptq_koboldai==0.0.6
+    - einops
    - peft==0.3.0
    - scipy
+    - --find-links=https://0cc4m.github.io/exllama/exllama-whl-links.html
+    - exllama==0.0.6
--- a/environments/rocm.yml
+++ b/environments/rocm.yml
@@ -41,4 +41,6 @@ dependencies:
    - ftfy
    - pydub
    - diffusers
+    - git+https://github.com/0cc4m/hf_bleeding_edge/
+    - einops
    - peft==0.3.0
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -6,7 +6,13 @@ import torch
 import shutil
 from typing import Union

-from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig
+from transformers import GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig
+try:
+    from hf_bleeding_edge import AutoModelForCausalLM
+except ImportError:
+    from transformers import AutoModelForCausalLM
+
+from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME

 import utils
 import modeling.lazy_loader as lazy_loader
@@ -21,7 +27,19 @@ model_backend_name = "Huggingface"
 model_backend_type = "Huggingface" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)

 class model_backend(HFTorchInferenceModel):
-        
+    def is_valid(self, model_name, model_path, menu_path):
+        """Report whether this backend can load the given model.
+
+        On top of the parent class's check, a model that exists locally is
+        only accepted when its folder holds at least one standard Hugging
+        Face weight file or weight index.
+
+        Args:
+            model_name: Model identifier; "/" is replaced by "_" to derive the
+                default folder under ``models/``.
+            model_path: Explicit model folder, or None.
+            menu_path: Passed through to the parent check unchanged.
+
+        Returns:
+            A falsy value when the parent check fails or the local folder has
+            no recognised weight file; a truthy value otherwise.
+        """
+        base_is_valid = super().is_valid(model_name, model_path, menu_path)
+        if not base_is_valid:
+            return base_is_valid
+
+        local_dir = None
+        default_dir = "models/{}".format(model_name.replace('/', '_'))
+        if model_path is not None and os.path.exists(model_path):
+            local_dir = model_path
+        elif os.path.exists(default_dir):
+            local_dir = default_dir
+
+        if local_dir is None:
+            # There is no local folder to probe, and os.path.join() raises
+            # TypeError when handed a non-path placeholder, so stop here.
+            # NOTE(review): assumes a model without a local copy should keep
+            # the parent's verdict (e.g. so it can still be fetched) - confirm.
+            return base_is_valid
+
+        weight_files = (
+            WEIGHTS_NAME,
+            WEIGHTS_INDEX_NAME,
+            TF2_WEIGHTS_NAME,
+            TF2_WEIGHTS_INDEX_NAME,
+            TF_WEIGHTS_NAME,
+            FLAX_WEIGHTS_NAME,
+            FLAX_WEIGHTS_INDEX_NAME,
+            SAFE_WEIGHTS_NAME,
+            SAFE_WEIGHTS_INDEX_NAME,
+        )
+
+        return any(os.path.exists(os.path.join(local_dir, fname)) for fname in weight_files)
+
    def _initialize_model(self):
        return

--- a/modeling/inference_models/gptq_hf_torch/class.py
+++ b/modeling/inference_models/gptq_hf_torch/class.py
@@ -0,0 +1,240 @@
+from __future__ import annotations
+
+import os
+import glob
+import json
+import torch
+import re
+import shutil
+import sys
+from typing import Union
+
+import utils
+import modeling.lazy_loader as lazy_loader
+import koboldai_settings
+from logger import logger, set_logger_verbosity
+
+from modeling.inference_models.hf_torch import HFTorchInferenceModel
+from modeling.tokenizer import GenericTokenizer
+
+from pathlib import Path
+
+
+model_backend_type = "GPTQ"
+model_backend_name = "Huggingface GPTQ"
+
+
+def load_model_gptq_settings(path):
+    """Inspect a model folder and describe its GPTQ quantisation, if any.
+
+    A folder counts as GPTQ when its ``config.json`` has a ``gptq_bits`` key,
+    or, failing that, when it contains a legacy ``*4bit*.pt`` or
+    ``*4bit*.safetensors`` file.
+
+    Args:
+        path: Model folder. Anything that cannot be read as a folder with a
+            valid ``config.json`` (including None) yields the "not GPTQ" result.
+
+    Returns:
+        A tuple ``(gptq_model, gptq_bits, gptq_groupsize, gptq_file,
+        gptq_version)``. For a non-GPTQ folder this is
+        ``(False, -1, -1, False, -1)``. A groupsize or version of -1 means
+        "not specified".
+    """
+    not_gptq = (False, -1, -1, False, -1)
+
+    try:
+        # The context manager closes the handle even when parsing fails.
+        with open(os.path.join(path, "config.json"), "r", encoding="utf-8") as config_file:
+            js = json.load(config_file)
+    except (OSError, TypeError, ValueError):
+        # Missing or unreadable file, non-path argument, or malformed JSON.
+        return not_gptq
+
+    if not isinstance(js, dict):
+        # A config that is not a JSON object carries no GPTQ keys.
+        js = {}
+
+    if "gptq_bits" in js:
+        safetensors_file = os.path.join(path, "model.safetensors")
+        pt_file = os.path.join(path, "model.ckpt")
+        # Prefer the safetensors checkpoint; fall back to the .ckpt name even
+        # if that file does not exist either.
+        gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file
+        return True, js["gptq_bits"], js.get("gptq_groupsize", -1), gptq_file, js.get("gptq_version", -1)
+
+    # Legacy layout: .pt files take precedence over .safetensors; each group is
+    # sorted so the choice does not depend on directory listing order.
+    gptq_legacy_files = sorted(glob.glob(os.path.join(path, "*4bit*.pt"))) + sorted(glob.glob(os.path.join(path, "*4bit*.safetensors")))
+    if not gptq_legacy_files:
+        return not_gptq
+
+    gptq_file = gptq_legacy_files[0]
+    # The groupsize is encoded in the file name, e.g. "4bit-128g.safetensors".
+    groupsize_match = re.search(r"4bit-(\d+)g", Path(gptq_file).name)
+    gptq_groupsize = int(groupsize_match.group(1)) if groupsize_match else -1
+
+    return True, 4, gptq_groupsize, gptq_file, -1
+
+
+def get_gptq_version(fpath):
+    """Guess the GPTQ format version of a checkpoint from the names stored in it.
+
+    Only the first MiB of the file is scanned. The marker sets are:
+    version 0 - "zeros", "scales", "bias", "qweight" (and no "qzeros");
+    version 1 - "qzeros", "scales", "bias", "qweight";
+    version 2 - "qzeros", "scales", "g_idx", "qweight".
+
+    Args:
+        fpath: Path of the quantised checkpoint file.
+
+    Returns:
+        ``(version, with_bias)``. ``with_bias`` is True only for a version 2
+        file that also contains "bias".
+
+    Raises:
+        RuntimeError: If none of the marker sets is present, instead of
+            silently returning None and breaking the caller's unpacking.
+    """
+    with open(fpath, "rb") as f:
+        head = f.read(1024 * 1024)
+
+    # Search the raw bytes: the textual repr of a bytes object contains escape
+    # sequences (e.g. "\x0b") that can fake a marker such as "bias".
+    found = {
+        marker: marker.encode("ascii") in head
+        for marker in ("zeros", "qzeros", "scales", "bias", "g_idx", "qweight")
+    }
+
+    if found["scales"] and found["qweight"]:
+        if found["qzeros"]:
+            # "qzeros" rules out version 0, so versions 1/2 never overlap it.
+            if found["g_idx"]:
+                return 2, found["bias"]
+            if found["bias"]:
+                return 1, False
+        elif found["zeros"] and found["bias"]:
+            return 0, False
+
+    raise RuntimeError(f"Could not determine the GPTQ version of {fpath}")
+
+
+class model_backend(HFTorchInferenceModel):
+    """Inference backend that loads GPTQ-quantised models from a local folder."""
+
+    def is_valid(self, model_name, model_path, menu_path):
+        """Return True when ``model_path`` is recognised as holding a GPTQ model.
+
+        ``model_name`` and ``menu_path`` are accepted for interface
+        compatibility and are not consulted here.
+        """
+        gptq_model, _, _, _, _ = load_model_gptq_settings(model_path)
+        return bool(gptq_model)
+
+    def _load(self, save_model: bool, initial_load: bool) -> None:
+        """Load the quantised model and its tokenizer from the local model path.
+
+        Raises:
+            NotImplementedError: If the model is not available locally;
+                downloading GPTQ models is not implemented.
+        """
+        # Make model path the same as the model name to make this consistent
+        # with the other loading method if it isn't a known model type. This
+        # code is not just a workaround for below, it is also used to make the
+        # behavior consistent with other loading methods - Henk717
+        # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]:
+        #     utils.koboldai_vars.custmodpth = utils.koboldai_vars.model
+
+        self.init_model_config()
+
+        # Lazy loading is switched off for this backend, which leaves the
+        # lazy-load branches below inactive.
+        self.lazy_load = False
+
+        gpulayers = self.breakmodel_config.gpu_blocks
+
+        try:
+            self.gpu_layers_list = [int(l) for l in gpulayers.split(",")]
+        except (ValueError, AttributeError):
+            # No usable per-device split: use the model's total layer count.
+            self.gpu_layers_list = [utils.num_layers(self.model_config)]
+
+        tf_kwargs = {
+            "low_cpu_mem_usage": True,
+        }
+
+        # If we're using torch_lazy_loader, we need to get breakmodel config
+        # early so that it knows where to load the individual model tensors
+        logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel))
+        if (
+            self.lazy_load
+            and utils.koboldai_vars.hascuda
+            and utils.koboldai_vars.breakmodel
+            and not utils.koboldai_vars.nobreakmodel
+        ):
+            self.breakmodel_device_config(self.model_config)
+
+        if self.lazy_load:
+            # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
+            tf_kwargs.pop("low_cpu_mem_usage", None)
+
+            # If we're using lazy loader, we need to figure out what the model's hidden layers are called
+            with lazy_loader.use_lazy_load(dematerialized_modules=True):
+                try:
+                    metamodel = AutoModelForCausalLM.from_config(self.model_config)
+                    utils.layers_module_names = utils.get_layers_module_names(metamodel)
+                    utils.module_names = list(metamodel.state_dict().keys())
+                    utils.named_buffers = list(metamodel.named_buffers(recurse=True))
+                except Exception as e:
+                    if utils.args.panic:
+                        raise e
+                    logger.warning(f"Gave up on lazy loading due to {e}")
+                    self.lazy_load = False
+
+        if self.get_local_model_path():
+            # Model is stored locally, load it.
+            self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
+            self.tokenizer = self._get_tokenizer(self.get_local_model_path())
+        else:
+            raise NotImplementedError("GPTQ Model downloading not implemented")
+
+        if (
+            utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default
+            and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj")
+        ):
+            # Replace the default bad-words list with every token of this
+            # tokenizer that contains a square bracket.
+            utils.koboldai_vars.badwordsids = [
+                [v]
+                for k, v in self.tokenizer.get_vocab().items()
+                if any(c in str(k) for c in "[]")
+            ]
+
+        self.patch_embedding()
+
+        self.model.kai_model = self
+        utils.koboldai_vars.modeldim = self.get_hidden_size()
+
+    def _get_model(self, location: str, tf_kwargs: dict):
+        """Load the GPTQ-quantised model stored in ``location``.
+
+        Model types with a dedicated ``gptq`` loader go through
+        ``load_quant_offload``; every other type falls back to AutoGPTQ.
+
+        Args:
+            location: Local model folder.
+            tf_kwargs: Accepted for interface compatibility; not used here.
+
+        Returns:
+            The loaded model object.
+
+        Raises:
+            RuntimeError: If the folder holds no GPTQ model, the GPTQ version
+                cannot be determined, or the model type has no dedicated
+                loader and AutoGPTQ is not installed.
+        """
+        import gptq
+        from gptq.gptj import load_quant as gptj_load_quant
+        from gptq.gptneox import load_quant as gptneox_load_quant
+        from gptq.llama import load_quant as llama_load_quant
+        from gptq.opt import load_quant as opt_load_quant
+        from gptq.bigcode import load_quant as bigcode_load_quant
+        from gptq.mpt import load_quant as mpt_load_quant
+        from gptq.offload import load_quant_offload
+
+        gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location)
+        if not gptq_model:
+            # Without this guard gptq_file is False, and open(False) would
+            # read from file descriptor 0 during version detection.
+            raise RuntimeError(f"4-bit load failed. No GPTQ model found in {location}")
+        v2_bias = False
+
+        if gptq_version < 0:
+            # Version not declared in config.json: detect it from the file.
+            detected = get_gptq_version(gptq_file)
+            if detected is None:
+                raise RuntimeError(f"4-bit load failed. Could not determine the GPTQ version of {gptq_file}")
+            gptq_version, v2_bias = detected
+        gptq.modelutils.set_gptq_version(gptq_version)
+
+        model_type = self.get_model_type()
+
+        logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}")
+
+        quant_loaders = {
+            "gptj": gptj_load_quant,
+            "gpt_neox": gptneox_load_quant,
+            "llama": llama_load_quant,
+            "opt": opt_load_quant,
+            "mpt": mpt_load_quant,
+            "gpt_bigcode": bigcode_load_quant,
+        }
+
+        if model_type in quant_loaders:
+            model = load_quant_offload(quant_loaders[model_type], location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+            if model_type == "gpt_bigcode":
+                # Only the bigcode model is converted to half precision.
+                model = model.half()
+            return model
+
+        try:
+            import auto_gptq
+            from auto_gptq import AutoGPTQForCausalLM
+        except ImportError as err:
+            raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit") from err
+
+        try:
+            import hf_bleeding_edge
+        except ImportError:
+            hf_bleeding_edge = None
+
+        if hf_bleeding_edge is not None:
+            # Monkey patch in hf_bleeding_edge to avoid having to trust remote code
+            auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
+            auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
+            auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM
+        else:
+            # Nothing to patch in; AutoGPTQ keeps the classes it imported itself.
+            logger.warning("hf_bleeding_edge is not installed, AutoGPTQ will not be patched")
+
+        model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"))
+
+        # Patch in embeddings function
+        def get_input_embeddings(self):
+            return self.model.get_input_embeddings()
+
+        type(model).get_input_embeddings = get_input_embeddings
+
+        # Patch in args support..
+        def generate(self, *args, **kwargs):
+            """shortcut for model.generate"""
+            with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
+                return self.model.generate(*args, **kwargs)
+
+        type(model).generate = generate
+
+        return model
+
+    def _get_tokenizer(self, location: str):
+        """Load the tokenizer stored in ``location``, wrapped in GenericTokenizer.
+
+        Llama models use ``LlamaTokenizer`` explicitly; every other model type
+        goes through ``AutoTokenizer``.
+        """
+        from transformers import AutoTokenizer, LlamaTokenizer
+
+        model_type = self.get_model_type()
+        if model_type == "llama":
+            tokenizer = LlamaTokenizer.from_pretrained(location)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(location)
+
+        return GenericTokenizer(tokenizer)
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -1,6 +1,10 @@
 import os, sys
 from typing import Optional
-from transformers import AutoConfig
+try:
+    from hf_bleeding_edge import AutoConfig
+except ImportError:
+    from transformers import AutoConfig
+
 import warnings
 import utils
 import json
@@ -383,7 +387,17 @@ class HFInferenceModel(InferenceModel):
                revision=utils.koboldai_vars.revision,
                cache_dir="cache",
            )
+
            self.model_type = self.model_config.model_type
+
+            if "gptq_bits" in dir(self.model_config):
+                self.gptq_model = True
+                self.gptq_bits = self.model_config.gptq_bits
+                self.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1
+                self.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1
+                self.gptq_file = None
+            else:
+                self.gptq_model = False
        except ValueError:
            self.model_type = {
                "NeoCustom": "gpt_neo",
@@ -394,4 +408,4 @@ class HFInferenceModel(InferenceModel):
                logger.warning(
                    "No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)"
                )
-                self.model_type = "gpt_neo"
+                self.model_type = "gpt_neo"
--- a/modeling/inference_models/hf_torch.py
+++ b/modeling/inference_models/hf_torch.py
@@ -17,9 +17,12 @@ from transformers import (
    StoppingCriteria,
    GPTNeoForCausalLM,
    GPT2LMHeadModel,
-    AutoModelForCausalLM,
    LogitsProcessorList,
 )
+try:
+    from hf_bleeding_edge import AutoModelForCausalLM
+except ImportError:
+    from transformers import AutoModelForCausalLM

 import utils
 import modeling.lazy_loader as lazy_loader
--- a/requirements.txt
+++ b/requirements.txt
@@ -38,5 +38,8 @@ pytest-html==3.2.0
 pytest-metadata==2.0.4
 requests-mock==1.10.0
 safetensors==0.3.1
+git+https://github.com/0cc4m/hf_bleeding_edge/
+--find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html gptq_koboldai==0.0.4
+einops
 peft==0.3.0
 scipy
--- a/templates/popups.html
+++ b/templates/popups.html
@@ -393,4 +393,4 @@
 	
 </div>

-<div id="notification-container"></div>
+<div id="notification-container"></div>
--- a/utils.py
+++ b/utils.py
@@ -184,7 +184,7 @@ def decodenewlines(txt):
 #  Returns number of layers given an HF model config
 #==================================================================#
 def num_layers(config):
-    return config["n_layer"] if isinstance(config, dict) else config.num_layers if hasattr(config, "num_layers") else config.n_layer if hasattr(config, "n_layer") else config.num_hidden_layers if hasattr(config, 'num_hidden_layers') else None
+    """Return the layer count stored in ``config``, or None if it has none.
+
+    A dict is read through its "n_layer" key; any other object is probed for
+    the attribute names below, in order, and the first one present wins.
+    """
+    if isinstance(config, dict):
+        return config["n_layer"]
+    for attr_name in ("num_layers", "n_layer", "num_hidden_layers", "n_layers"):
+        if hasattr(config, attr_name):
+            return getattr(config, attr_name)
+    return None

 #==================================================================#
 #  Downloads huggingface checkpoints using aria2c if possible
@@ -703,7 +703,7 @@ def applyoutputformatting(txt, no_sentence_trimming=False, no_single_line=False)
        txt = replaceblanklines(txt)

    # trim off starting new lines in replies if we're in chat mode
-    if koboldai_vars.chatmode and txt[0] == "\n":
+    if koboldai_vars.chatmode and txt and txt[0] == "\n":
        txt = txt[1:]

    # Remove special characters