Update to GPTQ module 0.0.2, add support for upstream CUDA quantizations and automatic version detection

Author: 0cc4m
Date: 2023-05-09 22:19:18 +02:00
parent 6121598142
commit a2d01bb9e4
6 changed files with 46 additions and 29 deletions

View File

@@ -1,3 +1,7 @@
<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-06-2/gptq_koboldai-0.0.1-cp38-cp38-linux_x86_64.whl">gptq_koboldai-0.0.1-cp38-cp38-linux_x86_64.whl</a>
<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-06-2/gptq_koboldai_rocm-0.0.1-cp38-cp38-linux_x86_64.whl">gptq_koboldai_rocm-0.0.1-cp38-cp38-linux_x86_64.whl</a>
<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-06-2/gptq_koboldai-0.0.1-cp38-cp38-win_amd64.whl">gptq_koboldai-0.0.1-cp38-cp38-win_amd64.whl</a>
<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-09/gptq_koboldai-0.0.2-cp38-cp38-linux_x86_64.whl">gptq_koboldai-0.0.2-cp38-cp38-linux_x86_64.whl</a>
<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-09/gptq_koboldai_rocm-0.0.2-cp38-cp38-linux_x86_64.whl">gptq_koboldai_rocm-0.0.2-cp38-cp38-linux_x86_64.whl</a>
<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-09/gptq_koboldai-0.0.2-cp38-cp38-win_amd64.whl">gptq_koboldai-0.0.2-cp38-cp38-win_amd64.whl</a>

View File

@@ -49,5 +49,5 @@ dependencies:
    - diffusers
    - git+https://github.com/0cc4m/hf_bleeding_edge/
    - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html
    - gptq_koboldai==0.0.1
    - gptq_koboldai==0.0.2
    - einops
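The environments now pull the 0.0.2 wheel through the --find-links page updated above. A minimal sketch for checking which wheel actually got installed, assuming only the distribution names from these dependency lines (gptq_koboldai here, gptq_koboldai_rocm in the ROCm environment below):

# Minimal check of the installed GPTQ wheel version; standard library only.
from importlib.metadata import PackageNotFoundError, version

for dist in ("gptq_koboldai", "gptq_koboldai_rocm"):
    try:
        print(dist, version(dist))  # expect 0.0.2 after this update
    except PackageNotFoundError:
        pass  # each environment installs only one of the two wheels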

View File

@@ -24,8 +24,8 @@ dependencies:
  - Pillow
  - psutil
  - pip:
    - --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
    - torch==2.0.0+rocm5.4.2
    - --extra-index-url https://download.pytorch.org/whl/rocm5.2
    - torch==1.13.1+rocm5.2
    - flask-cloudflared==0.0.10
    - flask-ngrok
    - flask-cors
@@ -44,5 +44,5 @@ dependencies:
    - diffusers
    - git+https://github.com/0cc4m/hf_bleeding_edge/
    - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html
    - gptq_koboldai_rocm==0.0.1
    - gptq_koboldai_rocm==0.0.2
    - einops

View File

@@ -925,6 +925,7 @@ class story_settings(settings):
        self.gptq_model = False
        self.gptq_bits = -1
        self.gptq_groupsize = -1
        self.gptq_version = -1
        self.gptq_file = None
        self.save_paths = SavePaths(os.path.join("stories", self.story_name or "Untitled"))

View File

@@ -181,7 +181,8 @@ class HFInferenceModel(InferenceModel):
if "gptq_bits" in dir(self.model_config):
utils.koboldai_vars.gptq_model = True
utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits
utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize
utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1
utils.koboldai_vars.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1
utils.koboldai_vars.gptq_file = None
else:
utils.koboldai_vars.gptq_model = False
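For reference, a minimal standalone sketch of the fallback pattern in the hunk above, assuming a config object in the style of a Hugging Face PretrainedConfig; FakeGPTQConfig and its values are made up, and version 1 is the default assumed when an upstream quantized config carries no gptq_version:

# Hypothetical stand-in for self.model_config on an upstream-quantized model.
class FakeGPTQConfig:
    gptq_bits = 4
    gptq_groupsize = 128
    # gptq_version deliberately absent, as with older quantized checkpoints

cfg = FakeGPTQConfig()
groupsize = cfg.gptq_groupsize if getattr(cfg, "gptq_groupsize", False) else -1
version = cfg.gptq_version if getattr(cfg, "gptq_version", False) else 1
print(groupsize, version)  # -> 128 1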

View File

@@ -48,8 +48,7 @@ def prepare_4bit_load(modelpath):
        return path_4bit, False
    # Legacy format support
    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
    paths_4bit = ["4bit*.safetensors", "4bit*.pt", "4bit-old.safetensors", "4bit-old.pt"]
    result = False
    groupsize = -1
    for p in paths_4bit:
@@ -59,26 +58,11 @@ def prepare_4bit_load(modelpath):
            result = val[0]
            fname = Path(result).parts[-1]
            g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
            groupsize = -1
            if g:
                groupsize = int(g[0])
            break
    if not result:
        print("4-bit file not found, falling back to old format.")
        for p in paths_4bit_old:
            p = os.path.join(modelpath, p)
            if os.path.isfile(p):
                result = p
                break
        if not result:
            print("4-bit old-format file not found, loading failed.")
            raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.")
        gptq.modelutils.set_gptq_version(0)
    else:
        gptq.modelutils.set_gptq_version(1)
    return result, groupsize
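A worked example of the 4bit-<groupsize>g filename convention parsed above; the filenames are made up for illustration:

import re

# Hypothetical legacy checkpoint names run through the same pattern as above.
for fname in ["4bit.safetensors", "4bit-128g.safetensors", "4bit-32g-old.pt"]:
    g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
    print(fname, int(g[0]) if g else -1)
# 4bit.safetensors -1
# 4bit-128g.safetensors 128
# 4bit-32g-old.pt 32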
@@ -103,6 +87,7 @@ def load_model_gptq_settings():
        safetensors_file = os.path.join(utils.koboldai_vars.custmodpth, "model.safetensors")
        pt_file = os.path.join(utils.koboldai_vars.custmodpth, "model.ckpt")
        utils.koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file
        utils.koboldai_vars.gptq_version = js.get("gptq_version", -1)
    elif gptq_legacy_files:
        utils.koboldai_vars.gptq_model = True
        utils.koboldai_vars.gptq_bits = 4
@@ -110,10 +95,37 @@ def load_model_gptq_settings():
        fname = Path(utils.koboldai_vars.gptq_file).parts[-1]
        g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
        utils.koboldai_vars.gptq_groupsize = int(g[0]) if g else -1
        utils.koboldai_vars.gptq_version = -1
    else:
        utils.koboldai_vars.gptq_model = False


def get_gptq_version(fpath):
    v1_strings = ["zeros", "scales", "bias", "qweight"]
    v2_strings = ["qzeros", "scales", "bias", "qweight"]
    v3_strings = ["qzeros", "scales", "g_idx", "qweight"]
    with open(fpath, "rb") as f:
        data = str(f.read(1024*1024))
    v0 = all([s in data for s in v1_strings]) and not "qzeros" in data
    v1 = all([s in data for s in v2_strings])
    v2 = all([s in data for s in v3_strings])
    if v2:
        if v0 or v1:
            logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}")
        return 2
    if v1:
        if v0 or v2:
            logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}")
        return 1
    if v0:
        if v1 or v2:
            logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}")
        return 0
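A short usage sketch of the detection helper above, mirroring how a later hunk in this file feeds the result into gptq.modelutils.set_gptq_version; the checkpoint path is hypothetical:

import gptq.modelutils

# Hypothetical path to a quantized checkpoint; get_gptq_version is defined above.
ckpt_path = "/models/example-llama-4bit/4bit-128g.safetensors"

version = get_gptq_version(ckpt_path)
if version is None:
    # get_gptq_version falls through to None when no tensor-name pattern matches.
    raise RuntimeError(f"Could not identify GPTQ format of {ckpt_path}")
gptq.modelutils.set_gptq_version(version)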
class HFTorch4BitInferenceModel(HFTorchInferenceModel):
    def _load(self, save_model: bool, initial_load: bool) -> None:
        utils.koboldai_vars.allowsp = True
@@ -140,9 +152,6 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
        except ValueError:
            self.gpu_layers_list = [utils.num_layers(self.model_config)]
        if sum(self.gpu_layers_list) < utils.num_layers(self.model_config):
            print("4-bit CPU offloader active")
        tf_kwargs = {
            "low_cpu_mem_usage": True,
        }
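The "4-bit CPU offloader active" branch above fires when the configured per-GPU layer split does not cover the whole model; a hypothetical illustration of that check (layer counts are made up):

# Hypothetical numbers: gpu_layers_list holds how many layers go on each GPU;
# anything not covered by the split is left for CPU offload.
num_layers = 32            # made-up total layer count for the model
gpu_layers_list = [20, 8]  # 20 layers on GPU 0, 8 on GPU 1 -> 4 layers remain

if sum(gpu_layers_list) < num_layers:
    print("4-bit CPU offloader active")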
@@ -351,12 +360,14 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
        path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth)
        if utils.koboldai_vars.gptq_version < 0:
            utils.koboldai_vars.gptq_version = get_gptq_version(path_4bit)
        gptq.modelutils.set_gptq_version(utils.koboldai_vars.gptq_version)
        if legacy_groupsize is not False:
            groupsize = legacy_groupsize
        print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}")
        print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit")
        logger.info(f"Using 4-bit file: {path_4bit}, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}")
        if utils.koboldai_vars.model_type == "gptj":
            model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
        elif utils.koboldai_vars.model_type == "gpt_neox":