From 12df8220fb2d6122ee828c0910943a8e08c7ebb4 Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Tue, 23 May 2023 06:59:28 +0200
Subject: [PATCH] Add gpt_bigcode support, fix 8-bit GPTQ incoherence

---
 docs/gptq-whl-links.html                         | 3 +++
 environments/huggingface.yml                     | 2 +-
 modeling/inference_models/gptq_hf_torch/class.py | 3 +++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html
index 34d05691..0808dbc6 100644
--- a/docs/gptq-whl-links.html
+++ b/docs/gptq-whl-links.html
@@ -11,3 +11,6 @@
 gptq_koboldai-0.0.4-cp38-cp38-linux_x86_64.whl
 gptq_koboldai-0.0.4-cp38-cp38-win_amd64.whl
+
+gptq_koboldai-0.0.5-cp38-cp38-linux_x86_64.whl
+gptq_koboldai-0.0.5-cp38-cp38-win_amd64.whl
diff --git a/environments/huggingface.yml b/environments/huggingface.yml
index acba0648..79258b60 100644
--- a/environments/huggingface.yml
+++ b/environments/huggingface.yml
@@ -47,6 +47,6 @@ dependencies:
     - diffusers
     - git+https://github.com/0cc4m/hf_bleeding_edge/
     - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html
-    - gptq_koboldai==0.0.4
+    - gptq_koboldai==0.0.5
     - einops
     - peft==0.3.0
diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py
index 7d7dfc00..0cc1da8d 100644
--- a/modeling/inference_models/gptq_hf_torch/class.py
+++ b/modeling/inference_models/gptq_hf_torch/class.py
@@ -35,6 +35,7 @@ from gptq.gptj import load_quant as gptj_load_quant
 from gptq.gptneox import load_quant as gptneox_load_quant
 from gptq.llama import load_quant as llama_load_quant
 from gptq.opt import load_quant as opt_load_quant
+from gptq.bigcode import load_quant as bigcode_load_quant
 from gptq.mpt import load_quant as mpt_load_quant
 from gptq.offload import load_quant_offload
 
@@ -220,6 +221,8 @@ class model_backend(HFTorchInferenceModel):
             model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list)
         elif model_type == "mpt":
             model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list)
+        elif model_type == "gpt_bigcode":
+            model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list).half()
         elif autogptq_support:
             # Monkey patch in hf_bleeding_edge to avoid having to trust remote code
             auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig