From a2d01bb9e454a0c951fc9c4c3e67599bcf188b5b Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Tue, 9 May 2023 22:19:18 +0200
Subject: [PATCH] Update to GPTQ module 0.0.2, add support for upstream cuda quantizations, automatic detection

---
 docs/gptq-whl-links.html                   |  4 ++
 environments/huggingface.yml               |  2 +-
 environments/rocm.yml                      |  6 +--
 koboldai_settings.py                       |  1 +
 modeling/inference_models/hf.py            |  3 +-
 modeling/inference_models/hf_torch_4bit.py | 59 +++++++++++++---------
 6 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html
index 08cd0cd7..64d15d3d 100644
--- a/docs/gptq-whl-links.html
+++ b/docs/gptq-whl-links.html
@@ -1,3 +1,7 @@
 gptq_koboldai-0.0.1-cp38-cp38-linux_x86_64.whl
 gptq_koboldai_rocm-0.0.1-cp38-cp38-linux_x86_64.whl
 gptq_koboldai-0.0.1-cp38-cp38-win_amd64.whl
+
+gptq_koboldai-0.0.2-cp38-cp38-linux_x86_64.whl
+gptq_koboldai_rocm-0.0.2-cp38-cp38-linux_x86_64.whl
+gptq_koboldai-0.0.2-cp38-cp38-win_amd64.whl
diff --git a/environments/huggingface.yml b/environments/huggingface.yml
index 12978b39..c381ea94 100644
--- a/environments/huggingface.yml
+++ b/environments/huggingface.yml
@@ -49,5 +49,5 @@ dependencies:
     - diffusers
     - git+https://github.com/0cc4m/hf_bleeding_edge/
     - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html
-    - gptq_koboldai==0.0.1
+    - gptq_koboldai==0.0.2
     - einops
diff --git a/environments/rocm.yml b/environments/rocm.yml
index 0cb44eb1..4f6cfa11 100644
--- a/environments/rocm.yml
+++ b/environments/rocm.yml
@@ -24,8 +24,8 @@ dependencies:
   - Pillow
   - psutil
   - pip:
-    - --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
-    - torch==2.0.0+rocm5.4.2
+    - --extra-index-url https://download.pytorch.org/whl/rocm5.2
+    - torch==1.13.1+rocm5.2
     - flask-cloudflared==0.0.10
     - flask-ngrok
     - flask-cors
@@ -44,5 +44,5 @@ dependencies:
     - diffusers
     - git+https://github.com/0cc4m/hf_bleeding_edge/
     - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html
-    - gptq_koboldai_rocm==0.0.1
+    - gptq_koboldai_rocm==0.0.2
     - einops
diff --git a/koboldai_settings.py b/koboldai_settings.py
index 3e0fc48a..f0df2162 100644
--- a/koboldai_settings.py
+++ b/koboldai_settings.py
@@ -925,6 +925,7 @@ class story_settings(settings):
         self.gptq_model = False
         self.gptq_bits = -1
         self.gptq_groupsize = -1
+        self.gptq_version = -1
         self.gptq_file = None

         self.save_paths = SavePaths(os.path.join("stories", self.story_name or "Untitled"))
diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index 5ee2abaa..7050f34e 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -181,7 +181,8 @@ class HFInferenceModel(InferenceModel):
         if "gptq_bits" in dir(self.model_config):
             utils.koboldai_vars.gptq_model = True
             utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits
-            utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize
+            utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1
+            utils.koboldai_vars.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1
             utils.koboldai_vars.gptq_file = None
         else:
             utils.koboldai_vars.gptq_model = False
diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py
index 350cd761..5917a43e 100644
--- a/modeling/inference_models/hf_torch_4bit.py
+++ b/modeling/inference_models/hf_torch_4bit.py
@@ -48,8 +48,7 @@ def prepare_4bit_load(modelpath):
         return path_4bit, False

     # Legacy format support
-    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
-    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
+    paths_4bit = ["4bit*.safetensors", "4bit*.pt", "4bit-old.safetensors", "4bit-old.pt"]
     result = False
     groupsize = -1
     for p in paths_4bit:
@@ -59,26 +58,11 @@ def prepare_4bit_load(modelpath):
             result = val[0]
             fname = Path(result).parts[-1]
             g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
+            groupsize = -1
             if g:
                 groupsize = int(g[0])
             break

-    if not result:
-        print("4-bit file not found, falling back to old format.")
-        for p in paths_4bit_old:
-            p = os.path.join(modelpath, p)
-            if os.path.isfile(p):
-                result = p
-                break
-
-        if not result:
-            print("4-bit old-format file not found, loading failed.")
-            raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.")
-
-        gptq.modelutils.set_gptq_version(0)
-    else:
-        gptq.modelutils.set_gptq_version(1)
-
     return result, groupsize


@@ -103,6 +87,7 @@ def load_model_gptq_settings():
         safetensors_file = os.path.join(utils.koboldai_vars.custmodpth, "model.safetensors")
         pt_file = os.path.join(utils.koboldai_vars.custmodpth, "model.ckpt")
         utils.koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file
+        utils.koboldai_vars.gptq_version = js.get("gptq_version", -1)
     elif gptq_legacy_files:
         utils.koboldai_vars.gptq_model = True
         utils.koboldai_vars.gptq_bits = 4
@@ -110,10 +95,37 @@ def load_model_gptq_settings():
         fname = Path(utils.koboldai_vars.gptq_file).parts[-1]
         g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
         utils.koboldai_vars.gptq_groupsize = int(g[0]) if g else -1
+        utils.koboldai_vars.gptq_version = -1
     else:
         utils.koboldai_vars.gptq_model = False


+def get_gptq_version(fpath):
+    v1_strings = ["zeros", "scales", "bias", "qweight"]
+    v2_strings = ["qzeros", "scales", "bias", "qweight"]
+    v3_strings = ["qzeros", "scales", "g_idx", "qweight"]
+
+    with open(fpath, "rb") as f:
+        data = str(f.read(1024*1024))
+
+    v0 = all([s in data for s in v1_strings]) and not "qzeros" in data
+    v1 = all([s in data for s in v2_strings])
+    v2 = all([s in data for s in v3_strings])
+
+    if v2:
+        if v0 or v1:
+            logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}")
+        return 2
+    if v1:
+        if v0 or v2:
+            logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}")
+        return 1
+    if v0:
+        if v1 or v2:
+            logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}")
+        return 0
+
+
 class HFTorch4BitInferenceModel(HFTorchInferenceModel):
     def _load(self, save_model: bool, initial_load: bool) -> None:
         utils.koboldai_vars.allowsp = True
@@ -140,9 +152,6 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
             except ValueError:
                 self.gpu_layers_list = [utils.num_layers(self.model_config)]

-        if sum(self.gpu_layers_list) < utils.num_layers(self.model_config):
-            print("4-bit CPU offloader active")
-
         tf_kwargs = {
             "low_cpu_mem_usage": True,
         }
@@ -351,12 +360,14 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
         path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth)

+        if utils.koboldai_vars.gptq_version < 0:
+            utils.koboldai_vars.gptq_version = get_gptq_version(path_4bit)
+        gptq.modelutils.set_gptq_version(utils.koboldai_vars.gptq_version)
+
         if legacy_groupsize is not False:
             groupsize = legacy_groupsize

-        print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}")
-
-        print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit")
+        logger.info(f"Using 4-bit file: {path_4bit}, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}")

         if utils.koboldai_vars.model_type == "gptj":
             model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
         elif utils.koboldai_vars.model_type == "gpt_neox":
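
Note on the automatic detection: get_gptq_version() works because tensor-key names appear as plain strings near the start of a checkpoint, and the key sets differ between formats: legacy v0 files contain "zeros", v1 files contain "qzeros", and upstream cuda v2 files additionally carry the act-order index "g_idx". The following is a minimal standalone sketch of that same heuristic for trying it outside KoboldAI; the function name, the ValueError fallback and the command-line wrapper are illustrative additions, not part of the patch.

import sys

# Tensor-key names that distinguish the three GPTQ checkpoint formats,
# mirroring the string sets used by get_gptq_version() in the patch.
V0_KEYS = ["zeros", "scales", "bias", "qweight"]    # legacy format
V1_KEYS = ["qzeros", "scales", "bias", "qweight"]   # packed-zeros format
V2_KEYS = ["qzeros", "scales", "g_idx", "qweight"]  # upstream cuda format

def detect_gptq_version(fpath: str) -> int:
    # The first MiB is enough: safetensors keeps its JSON header at the start
    # of the file, and torch .pt archives typically store the pickled key
    # names near the beginning -- the same assumption the patch makes.
    with open(fpath, "rb") as f:
        data = str(f.read(1024 * 1024))
    if all(s in data for s in V2_KEYS):
        return 2
    if all(s in data for s in V1_KEYS):
        return 1
    # "zeros" is a substring of "qzeros", so v0 must also rule out "qzeros".
    if all(s in data for s in V0_KEYS) and "qzeros" not in data:
        return 0
    raise ValueError(f"Could not identify the GPTQ format of {fpath}")

if __name__ == "__main__":
    # Usage with a hypothetical path: python detect_gptq.py /path/to/4bit-128g.safetensors
    print(detect_gptq_version(sys.argv[1]))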
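
Note on the legacy naming convention: the groupsize of a legacy checkpoint is recovered purely from its file name. Both prepare_4bit_load() and load_model_gptq_settings() apply the regex ^(?:4bit)(?:-)(\d+)(?:g-?) and fall back to -1 when nothing matches. A small sketch of that parsing; the helper name and example file names are illustrative only.

import re

# Same pattern the patch uses to pull the groupsize out of legacy file names
# such as "4bit-128g.safetensors"; -1 means "no groupsize encoded".
GROUPSIZE_RE = re.compile(r"^(?:4bit)(?:-)(\d+)(?:g-?)")

def parse_legacy_groupsize(fname: str) -> int:
    m = GROUPSIZE_RE.match(fname)
    return int(m.group(1)) if m else -1

# Hypothetical examples of the naming convention:
assert parse_legacy_groupsize("4bit-128g.safetensors") == 128
assert parse_legacy_groupsize("4bit.safetensors") == -1   # no groupsize in the name
assert parse_legacy_groupsize("4bit-old.pt") == -1        # old-format file, no groupsize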