From 8c9ed5540626655870b6c8e79b5a838f6f012a91 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Thu, 9 Mar 2023 22:36:45 -0500 Subject: [PATCH 001/113] Update aiserver.py --- aiserver.py | 63 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/aiserver.py b/aiserver.py index 7c60b04e..4174d1fa 100644 --- a/aiserver.py +++ b/aiserver.py @@ -87,6 +87,38 @@ from io import BytesIO global tpu_mtj_backend +from transformers.models.llama.tokenization_llama import LLaMATokenizer +from repos.gptq.gptq import * +from repos.gptq.modelutils import * +from repos.gptq.quant import * +def load_quant(model, checkpoint, wbits): + from transformers import LLaMAConfig, LLaMAForCausalLM + config = LLaMAConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = LLaMAForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + make_quant(model, layers, wbits) + + print('Loading model ...') + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done.') + + return model + if lupa.LUA_VERSION[:2] != (5, 4): logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.") @@ -2886,7 +2918,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal @functools.lru_cache(maxsize=None) def get_original_key(key): - return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + try: + return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + except ValueError: + return key for key, value in model_dict.items(): original_key = get_original_key(key) @@ -3083,22 +3118,24 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): + tokenizer = LLaMATokenizer.from_pretrained(koboldai_vars.custmodpth) + # try: + # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) + # except Exception as e: + # try: + # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") + # except Exception as e: + # try: + # tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") + # except Exception as e: + # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) - except Exception as e: - try: - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - try: - tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - tokenizer = GPT2Tokenizer.from_pretrained("gpt2", 
revision=koboldai_vars.revision, cache_dir="cache") - try: - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_30B_4BIT'], 4) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From dcf9d37a00dc582618f10deef6d226f77018dc16 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Fri, 10 Mar 2023 00:01:40 -0500 Subject: [PATCH 002/113] It just works. --- aiserver.py | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/aiserver.py b/aiserver.py index 4174d1fa..66aa7362 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1142,9 +1142,9 @@ def move_model_to_devices(model): if(not utils.HAS_ACCELERATE and not koboldai_vars.breakmodel): if(koboldai_vars.usegpu): - model = model.half().to(koboldai_vars.gpu_device) + model = model.to(koboldai_vars.gpu_device) else: - model = model.to('cpu').float() + model = model.to('cpu') generator = model.generate return @@ -1172,7 +1172,6 @@ def move_model_to_devices(model): generator = model.generate return - model.half() gc.collect() if(hasattr(model, "transformer")): @@ -2983,10 +2982,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
", end="", flush=True) model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - if model_dict[key].dtype is torch.float32: - koboldai_vars.fp32_model = True - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - model_dict[key] = model_dict[key].to(torch.float16) + # if model_dict[key].dtype is torch.float32: + # koboldai_vars.fp32_model = True + # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: + # model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -3010,16 +3009,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if utils.offload_index: for name, tensor in utils.named_buffers: dtype = tensor.dtype - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): - dtype = torch.float32 - if name in model_dict and model_dict[name].dtype is not dtype: - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is not dtype: - tensor = tensor.to(dtype) - if name not in utils.offload_index: - accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) + # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): + # dtype = torch.float16 + # if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): + # dtype = torch.float32 + # if name in model_dict and model_dict[name].dtype is not dtype: + # model_dict[name] = model_dict[name].to(dtype) + # if tensor.dtype is not dtype: + # tensor = tensor.to(dtype) + # if name not in utils.offload_index: + # accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache") utils.bar.close() utils.bar = None @@ -3078,10 +3077,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) # Is CUDA available? 
If so, use GPU, otherwise fall back to CPU if(koboldai_vars.hascuda and koboldai_vars.usegpu): - model = model.half().to(koboldai_vars.gpu_device) + model = model.to(koboldai_vars.gpu_device) generator = model.generate else: - model = model.to('cpu').float() + model = model.to('cpu') generator = model.generate patch_causallm(model) # Use the Generic implementation @@ -3131,7 +3130,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_30B_4BIT'], 4) + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") @@ -3190,7 +3189,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal import shutil tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) if(koboldai_vars.fp32_model and ("breakmodel" not in globals() or not breakmodel.disk_blocks)): # Use save_pretrained to convert fp32 models to fp16, unless we are using disk cache because save_pretrained is not supported in that case - model = model.half() model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB") else: # For fp16 models, we can just copy the model files directly import transformers.configuration_utils @@ -3224,7 +3222,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.hascuda): if(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) - model = model.half().to(koboldai_vars.gpu_device) + model = model.to(koboldai_vars.gpu_device) generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) @@ -3236,7 +3234,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model = model.to('cpu').float() + model = model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): @@ -3244,7 +3242,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model.to('cpu').float() + model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate From 3f132ce45ba61f30015147bb0d9ba26647204332 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Fri, 10 Mar 2023 03:26:09 -0500 Subject: [PATCH 003/113] Notify if LLAMA_4BIT env var not set --- aiserver.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 66aa7362..399ce434 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3130,7 +3130,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: 
# model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + if os.environ.get('LLAMA_4BIT') is not None: + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + else: + raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before procedding.") + exit(1) + except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") From 1808b0d2eca42e30bee6edd6896744cfd6995ffc Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Sat, 11 Mar 2023 12:13:22 -0500 Subject: [PATCH 004/113] Another safety check for if model is not loaded --- aiserver.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 399ce434..3ec8f284 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3133,13 +3133,17 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if os.environ.get('LLAMA_4BIT') is not None: model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) else: - raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before procedding.") + raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") exit(1) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + + if model is None: + raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load. Exiting.") + exit(1) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From bde31217f164a3aadc4282913012378a886d6058 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Sat, 11 Mar 2023 12:15:58 -0500 Subject: [PATCH 005/113] improve model None check --- aiserver.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/aiserver.py b/aiserver.py index 3ec8f284..c14ac730 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3134,16 +3134,14 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) else: raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") - exit(1) + + if model is None: + raise RuntimeError("Model returned 'None'. 
This is not expected to happen, but due to this, the model will not load.") except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - - if model is None: - raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load. Exiting.") - exit(1) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From b3b454bbe4b4a479ec5703b99487bf00906975ac Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed, 15 Mar 2023 00:03:43 -0500 Subject: [PATCH 006/113] Update huggingface.yml --- environments/huggingface.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 222bb6ad..26e7e670 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -29,7 +29,8 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers==4.25.1 + - git+https://github.com/zphang/transformers@llama_push + - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc - huggingface_hub==0.12.1 - safetensors - accelerate From 5d17692c79a3642b7e1ae1c37e262cd47f449356 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 16 Mar 2023 05:19:47 +0000 Subject: [PATCH 007/113] Remove except Exception so that errors actually show up --- aiserver.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/aiserver.py b/aiserver.py index 77e31b63..40d9a4ba 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3128,20 +3128,15 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") # except Exception as e: # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - try: - # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - if os.environ.get('LLAMA_4BIT') is not None: - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) - else: - raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") + # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - if model is None: - raise RuntimeError("Model returned 'None'. 
This is not expected to happen, but due to this, the model will not load.") + if os.environ.get('LLAMA_4BIT'): + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + else: + raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") - except Exception as e: - if("out of memory" in traceback.format_exc().lower()): - raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + if model is None: + raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load.") elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From 60acf593160ce86118286ab0fa5c4ce082ddc52c Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 19 Mar 2023 21:19:02 +0000 Subject: [PATCH 008/113] Improve 4-bit llama support, add 4-bit gptj and gptneox support --- aiserver.py | 86 +++++++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 46 deletions(-) diff --git a/aiserver.py b/aiserver.py index 40d9a4ba..96ea7490 100644 --- a/aiserver.py +++ b/aiserver.py @@ -87,37 +87,14 @@ from io import BytesIO global tpu_mtj_backend -from transformers.models.llama.tokenization_llama import LLaMATokenizer -from repos.gptq.gptq import * -from repos.gptq.modelutils import * -from repos.gptq.quant import * -def load_quant(model, checkpoint, wbits): - from transformers import LLaMAConfig, LLaMAForCausalLM - config = LLaMAConfig.from_pretrained(model) - def noop(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = noop - torch.nn.init.uniform_ = noop - torch.nn.init.normal_ = noop - torch.set_default_dtype(torch.half) - transformers.modeling_utils._init_weights = False - torch.set_default_dtype(torch.half) - model = LLaMAForCausalLM(config) - torch.set_default_dtype(torch.float) - model = model.eval() - layers = find_layers(model) - for name in ['lm_head']: - if name in layers: - del layers[name] - make_quant(model, layers, wbits) - - print('Loading model ...') - model.load_state_dict(torch.load(checkpoint)) - model.seqlen = 2048 - print('Done.') - - return model +# 4-bit dependencies +from pathlib import Path +sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) +from gptj import load_quant as gptj_load_quant +from gptneox import load_quant as gptneox_load_quant +from llama import load_quant as llama_load_quant +vars_4bit = {} if lupa.LUA_VERSION[:2] != (5, 4): @@ -1541,6 +1518,11 @@ def general_startup(override_args=None): parser.add_argument('-v', '--verbosity', action='count', default=0, help="The default logging level is ERROR or higher. This value increases the amount of logging seen in your screen") parser.add_argument('-q', '--quiesce', action='count', default=0, help="The default logging level is ERROR or higher. 
This value decreases the amount of logging seen in your screen") + # 4-bit stuff + parser.add_argument('--gptj4bit', help="Load a GPT-J model 4-bit pt file with this path") + parser.add_argument('--gptneox4bit', help="Load a GPT-NeoX model 4-bit pt file with this path") + parser.add_argument('--llama4bit', help="Load a Llama model 4-bit pt file with this path") + #args: argparse.Namespace = None if "pytest" in sys.modules and override_args is None: args = parser.parse_args([]) @@ -1644,6 +1626,11 @@ def general_startup(override_args=None): koboldai_vars.smanrename = koboldai_vars.host == args.override_rename koboldai_vars.aria2_port = args.aria2_port or 6799 + + global vars_4bit + vars_4bit["gptj4bit"] = args.gptj4bit + vars_4bit["gptneox4bit"] = args.gptneox4bit + vars_4bit["llama4bit"] = args.llama4bit #Now let's look to see if we are going to force a load of a model from a user selected folder if(koboldai_vars.model == "selectfolder"): @@ -2971,7 +2958,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal try: f = z.open(f"archive/data/{storage_key}") except: - f = z.open(f"{zipfolder}/data/{storage_key}") + ziproot = z.namelist()[0].split(os.sep)[0] + f = z.open(f"{ziproot}/data/{storage_key}") current_offset = 0 if current_offset != model_dict[key].seek_offset: f.read(model_dict[key].seek_offset - current_offset) @@ -3117,23 +3105,29 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): - tokenizer = LLaMATokenizer.from_pretrained(koboldai_vars.custmodpth) - # try: - # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) - # except Exception as e: - # try: - # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - # except Exception as e: - # try: - # tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - # except Exception as e: - # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + global vars_4bit - if os.environ.get('LLAMA_4BIT'): - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + if vars_4bit.get("gptj4bit"): + model = gptj_load_quant(koboldai_vars.custmodpth, vars_4bit["gptj4bit"], 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif vars_4bit.get("gptneox4bit"): + model = gptneox_load_quant(koboldai_vars.custmodpth, vars_4bit["gptneox4bit"], 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif vars_4bit.get("llama4bit"): + model = llama_load_quant(koboldai_vars.custmodpth, vars_4bit["llama4bit"], 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: - raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") + try: + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) + except Exception as e: + try: + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, 
revision=koboldai_vars.revision, cache_dir="cache") + except Exception as e: + try: + tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") + except Exception as e: + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) if model is None: raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load.") From 858657f6691933ad3660660001837491b7ba4ae6 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 20 Mar 2023 09:16:30 +0100 Subject: [PATCH 009/113] Fix zipfile folder identification fix for Windows --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 96ea7490..4558ce3d 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2958,7 +2958,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal try: f = z.open(f"archive/data/{storage_key}") except: - ziproot = z.namelist()[0].split(os.sep)[0] + ziproot = z.namelist()[0].split("/")[0] f = z.open(f"{ziproot}/data/{storage_key}") current_offset = 0 if current_offset != model_dict[key].seek_offset: From 4cfc1219d449ebc92205eed15f0ffc1b133db708 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 20 Mar 2023 19:13:46 +0000 Subject: [PATCH 010/113] Add gptq as submodule --- .gitmodules | 4 ++++ repos/gptq | 1 + 2 files changed, 5 insertions(+) create mode 160000 repos/gptq diff --git a/.gitmodules b/.gitmodules index 0107a8c3..c6f4b308 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,7 @@ [submodule "KoboldAI-Horde-Bridge"] path = KoboldAI-Horde-Bridge url = https://github.com/db0/KoboldAI-Horde-Bridge +[submodule "repos/gptq"] + path = repos/gptq + url = https://github.com/0cc4m/GPTQ-for-LLaMa + branch = a8303654c200c25577130466e5f9bc1e70fc8a50 diff --git a/repos/gptq b/repos/gptq new file mode 160000 index 00000000..a8303654 --- /dev/null +++ b/repos/gptq @@ -0,0 +1 @@ +Subproject commit a8303654c200c25577130466e5f9bc1e70fc8a50 From ecd065a881d40996558ff07d0e2bfdbdf255e777 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 21 Mar 2023 21:40:59 +0000 Subject: [PATCH 011/113] Overhaul 4-bit support to load with a toggle --- aiserver.py | 145 +++++++++++++++++++++++++++--------------- koboldai_settings.py | 6 +- static/koboldai.js | 32 +++++++++- templates/popups.html | 6 +- 4 files changed, 130 insertions(+), 59 deletions(-) diff --git a/aiserver.py b/aiserver.py index f58d949a..7497dfb9 100644 --- a/aiserver.py +++ b/aiserver.py @@ -70,7 +70,7 @@ from utils import debounce import utils import koboldai_settings import torch -from transformers import StoppingCriteria, GPT2Tokenizer, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel, modeling_utils, AutoModelForTokenClassification +from transformers import StoppingCriteria, GPT2Tokenizer, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel, modeling_utils, AutoModelForTokenClassification, LlamaTokenizer from transformers import __version__ as transformers_version import transformers try: @@ -1114,14 +1114,20 @@ def device_config(config): koboldai_vars.usegpu = False return -def move_model_to_devices(model): +def move_model_to_devices(model, use_4_bit=False): global generator if(not 
utils.HAS_ACCELERATE and not koboldai_vars.breakmodel): if(koboldai_vars.usegpu): - model = model.to(koboldai_vars.gpu_device) + if not use_4_bit: + model = model.half().to(koboldai_vars.gpu_device) + else: + model = model.to(koboldai_vars.gpu_device) else: - model = model.to('cpu') + if not use_4_bit: + model = model.to('cpu').float() + else: + model = model.to('cpu') generator = model.generate return @@ -1149,6 +1155,8 @@ def move_model_to_devices(model): generator = model.generate return + if not use_4_bit: + model.half() gc.collect() if(hasattr(model, "transformer")): @@ -1518,11 +1526,6 @@ def general_startup(override_args=None): parser.add_argument('-v', '--verbosity', action='count', default=0, help="The default logging level is ERROR or higher. This value increases the amount of logging seen in your screen") parser.add_argument('-q', '--quiesce', action='count', default=0, help="The default logging level is ERROR or higher. This value decreases the amount of logging seen in your screen") - # 4-bit stuff - parser.add_argument('--gptj4bit', help="Load a GPT-J model 4-bit pt file with this path") - parser.add_argument('--gptneox4bit', help="Load a GPT-NeoX model 4-bit pt file with this path") - parser.add_argument('--llama4bit', help="Load a Llama model 4-bit pt file with this path") - #args: argparse.Namespace = None if "pytest" in sys.modules and override_args is None: args = parser.parse_args([]) @@ -1626,11 +1629,6 @@ def general_startup(override_args=None): koboldai_vars.smanrename = koboldai_vars.host == args.override_rename koboldai_vars.aria2_port = args.aria2_port or 6799 - - global vars_4bit - vars_4bit["gptj4bit"] = args.gptj4bit - vars_4bit["gptneox4bit"] = args.gptneox4bit - vars_4bit["llama4bit"] = args.llama4bit #Now let's look to see if we are going to force a load of a model from a user selected folder if(koboldai_vars.model == "selectfolder"): @@ -1777,6 +1775,7 @@ def get_model_info(model, directory=""): 'break_values': break_values, 'gpu_count': gpu_count, 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select, 'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False, + 'bit_4_available': koboldai_vars.bit_4_available if koboldai_vars.experimental_features else False, 'show_custom_model_box': show_custom_model_box}) if send_horde_models: get_cluster_models({'key': key_value, 'url': default_url}) @@ -1918,6 +1917,18 @@ def get_cluster_models(msg): emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2") +@socketio.on("use_4_bit_toggle") +def use_4_bit_toggle(msg): + # Disable lazy_load and breakmodel + if msg["use_4_bit"]: + koboldai_vars.lazy_load = False + koboldai_vars.nobreakmodel = True + else: + koboldai_vars.lazy_load = True + koboldai_vars.nobreakmodel = False + + # TODO: Reload JS values for this stuff + # Function to patch transformers to use our soft prompt def patch_causallm(model): from torch.nn import Embedding @@ -2647,7 +2658,7 @@ def unload_model(): koboldai_vars.badwordsids = koboldai_settings.badwordsids_default -def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False): +def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, 
use_4_bit=False): global model global generator global torch @@ -2684,7 +2695,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal disk_layers = args.breakmodel_disklayers if breakmodel_args_default_to_cpu and disk_layers is None: disk_layers = args.breakmodel_disklayers = 0 - + unload_model() if online_model == "": @@ -2904,10 +2915,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal @functools.lru_cache(maxsize=None) def get_original_key(key): - try: - return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) - except ValueError: - return key + # try: + return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + # except ValueError: + # return key for key, value in model_dict.items(): original_key = get_original_key(key) @@ -2970,10 +2981,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True) model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - # if model_dict[key].dtype is torch.float32: - # koboldai_vars.fp32_model = True - # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - # model_dict[key] = model_dict[key].to(torch.float16) + if not use_4_bit: + if model_dict[key].dtype is torch.float32: + koboldai_vars.fp32_model = True + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: + model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -2997,16 +3009,17 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if utils.offload_index: for name, tensor in utils.named_buffers: dtype = tensor.dtype - # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): - # dtype = torch.float16 - # if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): - # dtype = torch.float32 - # if name in model_dict and model_dict[name].dtype is not dtype: - # model_dict[name] = model_dict[name].to(dtype) - # if tensor.dtype is not dtype: - # tensor = tensor.to(dtype) - # if name not in utils.offload_index: - # accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) + if not use_4_bit: + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): + dtype = torch.float16 + if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): + dtype = torch.float32 + if name in model_dict and model_dict[name].dtype is not dtype: + model_dict[name] = model_dict[name].to(dtype) + if tensor.dtype is not dtype: + tensor = tensor.to(dtype) + if name not 
in utils.offload_index: + accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache") utils.bar.close() utils.bar = None @@ -3065,10 +3078,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) # Is CUDA available? If so, use GPU, otherwise fall back to CPU if(koboldai_vars.hascuda and koboldai_vars.usegpu): - model = model.to(koboldai_vars.gpu_device) + if not use_4_bit: + model = model.half().to(koboldai_vars.gpu_device) + else: + model = model.to(koboldai_vars.gpu_device) generator = model.generate else: - model = model.to('cpu') + if not use_4_bit: + model = model.to('cpu').float() + else: + model = model.to('cpu') generator = model.generate patch_causallm(model) # Use the Generic implementation @@ -3105,17 +3124,26 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): - global vars_4bit - if vars_4bit.get("gptj4bit"): - model = gptj_load_quant(koboldai_vars.custmodpth, vars_4bit["gptj4bit"], 4) - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) - elif vars_4bit.get("gptneox4bit"): - model = gptneox_load_quant(koboldai_vars.custmodpth, vars_4bit["gptneox4bit"], 4) - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) - elif vars_4bit.get("llama4bit"): - model = llama_load_quant(koboldai_vars.custmodpth, vars_4bit["llama4bit"], 4) - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") + + if not os.path.isfile(path_4bit): + print(f"4-bit file {path_4bit} not found, aborting 4-bit load") + use_4_bit = False + + if use_4_bit: + print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") + if koboldai_vars.model_type == "gptj": + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif koboldai_vars.model_type == "gpt_neox": + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif koboldai_vars.model_type == "llama": + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) + else: + raise RuntimeError(f"4-bit load failed. 
Model type {koboldai_vars.model_type} not supported in 4-bit") else: try: tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) @@ -3185,6 +3213,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal import shutil tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) if(koboldai_vars.fp32_model and ("breakmodel" not in globals() or not breakmodel.disk_blocks)): # Use save_pretrained to convert fp32 models to fp16, unless we are using disk cache because save_pretrained is not supported in that case + if not use_4_bit: + model = model.half() model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB") else: # For fp16 models, we can just copy the model files directly import transformers.configuration_utils @@ -3218,27 +3248,36 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.hascuda): if(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) - model = model.to(koboldai_vars.gpu_device) + if not use_4_bit: + model = model.half().to(koboldai_vars.gpu_device) + else: + model = model.to(koboldai_vars.gpu_device) generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) if(not koboldai_vars.lazy_load): device_config(model.config) - move_model_to_devices(model) + move_model_to_devices(model, use_4_bit) elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): - move_model_to_devices(model) + move_model_to_devices(model, use_4_bit) koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model = model.to('cpu') + if not use_4_bit: + model.to('cpu').float() + else: + model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): - move_model_to_devices(model) + move_model_to_devices(model, use_4_bit) koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model.to('cpu') + if not use_4_bit: + model.to('cpu').float() + else: + model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate @@ -8784,7 +8823,7 @@ def UI_2_load_model(data): koboldai_vars.model = data['model'] koboldai_vars.custmodpth = data['path'] print("loading Model") - load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit']) + load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'], use_4_bit=data['use_4_bit']) #==================================================================# # Event triggered when load story is clicked diff --git a/koboldai_settings.py b/koboldai_settings.py index 95caec0c..16cc8128 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1207,7 +1207,7 @@ class system_settings(settings): 'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy', 'serverstarted', 'inference_config', 'image_pipeline', 'summarizer', 'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model', - 
'generating_image', 'bit_8_available', 'host', 'hascuda', 'usegpu', 'rng_states'] + 'generating_image', 'bit_8_available', 'bit_4_available', 'host', 'hascuda', 'usegpu', 'rng_states'] settings_name = "system" def __init__(self, socketio, koboldai_var): self._socketio = socketio @@ -1302,6 +1302,8 @@ class system_settings(settings): elif torch.cuda.get_device_properties(device).major == 7 and torch.cuda.get_device_properties(device).minor >= 2: self.bit_8_available = True break + # Check if repos/gptq exists for 4-bit mode + self.bit_4_available = os.path.isdir("repos/gptq") self.seen_messages = [] @@ -2744,4 +2746,4 @@ default_preset = { ] } badwordsids_default = [[6880], [50256], [42496], [4613], [17414], [22039], [16410], [27], [29], [38430], [37922], [15913], [24618], [28725], [58], [47175], [36937], [26700], [12878], [16471], [37981], [5218], [29795], [13412], [45160], [3693], [49778], [4211], [20598], [36475], [33409], [44167], [32406], [29847], [29342], [42669], [685], [25787], [7359], [3784], [5320], [33994], [33490], [34516], [43734], [17635], [24293], [9959], [23785], [21737], [28401], [18161], [26358], [32509], [1279], [38155], [18189], [26894], [6927], [14610], [23834], [11037], [14631], [26933], [46904], [22330], [25915], [47934], [38214], [1875], [14692], [41832], [13163], [25970], [29565], [44926], [19841], [37250], [49029], [9609], [44438], [16791], [17816], [30109], [41888], [47527], [42924], [23984], [49074], [33717], [31161], [49082], [30138], [31175], [12240], [14804], [7131], [26076], [33250], [3556], [38381], [36338], [32756], [46581], [17912], [49146]] # Tokenized array of badwords used to prevent AI artifacting -badwordsids_neox = [[0], [1], [44162], [9502], [12520], [31841], [36320], [49824], [34417], [6038], [34494], [24815], [26635], [24345], [3455], [28905], [44270], [17278], [32666], [46880], [7086], [43189], [37322], [17778], [20879], [49821], [3138], [14490], [4681], [21391], [26786], [43134], [9336], [683], [48074], [41256], [19181], [29650], [28532], [36487], [45114], [46275], [16445], [15104], [11337], [1168], [5647], [29], [27482], [44965], [43782], [31011], [42944], [47389], [6334], [17548], [38329], [32044], [35487], [2239], [34761], [7444], [1084], [12399], [18990], [17636], [39083], [1184], [35830], [28365], [16731], [43467], [47744], [1138], [16079], [40116], [45564], [18297], [42368], [5456], [18022], [42696], [34476], [23505], [23741], [39334], [37944], [45382], [38709], [33440], [26077], [43600], [34418], [36033], [6660], [48167], [48471], [15775], [19884], [41533], [1008], [31053], [36692], [46576], [20095], [20629], [31759], [46410], [41000], [13488], [30952], [39258], [16160], [27655], [22367], [42767], [43736], [49694], [13811], [12004], [46768], [6257], [37471], [5264], [44153], [33805], [20977], [21083], [25416], [14277], [31096], [42041], [18331], [33376], [22372], [46294], [28379], [38475], [1656], [5204], [27075], [50001], [16616], [11396], [7748], [48744], [35402], [28120], [41512], [4207], [43144], [14767], [15640], [16595], [41305], [44479], [38958], [18474], [22734], [30522], [46267], [60], [13976], [31830], [48701], [39822], [9014], [21966], [31422], [28052], [34607], [2479], [3851], [32214], [44082], [45507], [3001], [34368], [34758], [13380], [38363], [4299], [46802], [30996], [12630], [49236], [7082], [8795], [5218], [44740], [9686], [9983], [45301], [27114], [40125], [1570], [26997], [544], [5290], [49193], [23781], [14193], [40000], [2947], [43781], [9102], [48064], [42274], [18772], [49384], [9884], [45635], [43521], [31258], 
[32056], [47686], [21760], [13143], [10148], [26119], [44308], [31379], [36399], [23983], [46694], [36134], [8562], [12977], [35117], [28591], [49021], [47093], [28653], [29013], [46468], [8605], [7254], [25896], [5032], [8168], [36893], [38270], [20499], [27501], [34419], [29547], [28571], [36586], [20871], [30537], [26842], [21375], [31148], [27618], [33094], [3291], [31789], [28391], [870], [9793], [41361], [47916], [27468], [43856], [8850], [35237], [15707], [47552], [2730], [41449], [45488], [3073], [49806], [21938], [24430], [22747], [20924], [46145], [20481], [20197], [8239], [28231], [17987], [42804], [47269], [29972], [49884], [21382], [46295], [36676], [34616], [3921], [26991], [27720], [46265], [654], [9855], [40354], [5291], [34904], [44342], [2470], [14598], [880], [19282], [2498], [24237], [21431], [16369], [8994], [44524], [45662], [13663], [37077], [1447], [37786], [30863], [42854], [1019], [20322], [4398], [12159], [44072], [48664], [31547], [18736], [9259], [31], [16354], [21810], [4357], [37982], [5064], [2033], [32871], [47446], [62], [22158], [37387], [8743], [47007], [17981], [11049], [4622], [37916], [36786], [35138], [29925], [14157], [18095], [27829], [1181], [22226], [5709], [4725], [30189], [37014], [1254], [11380], [42989], [696], [24576], [39487], [30119], [1092], [8088], [2194], [9899], [14412], [21828], [3725], [13544], [5180], [44679], [34398], [3891], [28739], [14219], [37594], [49550], [11326], [6904], [17266], [5749], [10174], [23405], [9955], [38271], [41018], [13011], [48392], [36784], [24254], [21687], [23734], [5413], [41447], [45472], [10122], [17555], [15830], [47384], [12084], [31350], [47940], [11661], [27988], [45443], [905], [49651], [16614], [34993], [6781], [30803], [35869], [8001], [41604], [28118], [46462], [46762], [16262], [17281], [5774], [10943], [5013], [18257], [6750], [4713], [3951], [11899], [38791], [16943], [37596], [9318], [18413], [40473], [13208], [16375]] \ No newline at end of file +badwordsids_neox = [[0], [1], [44162], [9502], [12520], [31841], [36320], [49824], [34417], [6038], [34494], [24815], [26635], [24345], [3455], [28905], [44270], [17278], [32666], [46880], [7086], [43189], [37322], [17778], [20879], [49821], [3138], [14490], [4681], [21391], [26786], [43134], [9336], [683], [48074], [41256], [19181], [29650], [28532], [36487], [45114], [46275], [16445], [15104], [11337], [1168], [5647], [29], [27482], [44965], [43782], [31011], [42944], [47389], [6334], [17548], [38329], [32044], [35487], [2239], [34761], [7444], [1084], [12399], [18990], [17636], [39083], [1184], [35830], [28365], [16731], [43467], [47744], [1138], [16079], [40116], [45564], [18297], [42368], [5456], [18022], [42696], [34476], [23505], [23741], [39334], [37944], [45382], [38709], [33440], [26077], [43600], [34418], [36033], [6660], [48167], [48471], [15775], [19884], [41533], [1008], [31053], [36692], [46576], [20095], [20629], [31759], [46410], [41000], [13488], [30952], [39258], [16160], [27655], [22367], [42767], [43736], [49694], [13811], [12004], [46768], [6257], [37471], [5264], [44153], [33805], [20977], [21083], [25416], [14277], [31096], [42041], [18331], [33376], [22372], [46294], [28379], [38475], [1656], [5204], [27075], [50001], [16616], [11396], [7748], [48744], [35402], [28120], [41512], [4207], [43144], [14767], [15640], [16595], [41305], [44479], [38958], [18474], [22734], [30522], [46267], [60], [13976], [31830], [48701], [39822], [9014], [21966], [31422], [28052], [34607], [2479], [3851], [32214], [44082], [45507], [3001], 
[34368], [34758], [13380], [38363], [4299], [46802], [30996], [12630], [49236], [7082], [8795], [5218], [44740], [9686], [9983], [45301], [27114], [40125], [1570], [26997], [544], [5290], [49193], [23781], [14193], [40000], [2947], [43781], [9102], [48064], [42274], [18772], [49384], [9884], [45635], [43521], [31258], [32056], [47686], [21760], [13143], [10148], [26119], [44308], [31379], [36399], [23983], [46694], [36134], [8562], [12977], [35117], [28591], [49021], [47093], [28653], [29013], [46468], [8605], [7254], [25896], [5032], [8168], [36893], [38270], [20499], [27501], [34419], [29547], [28571], [36586], [20871], [30537], [26842], [21375], [31148], [27618], [33094], [3291], [31789], [28391], [870], [9793], [41361], [47916], [27468], [43856], [8850], [35237], [15707], [47552], [2730], [41449], [45488], [3073], [49806], [21938], [24430], [22747], [20924], [46145], [20481], [20197], [8239], [28231], [17987], [42804], [47269], [29972], [49884], [21382], [46295], [36676], [34616], [3921], [26991], [27720], [46265], [654], [9855], [40354], [5291], [34904], [44342], [2470], [14598], [880], [19282], [2498], [24237], [21431], [16369], [8994], [44524], [45662], [13663], [37077], [1447], [37786], [30863], [42854], [1019], [20322], [4398], [12159], [44072], [48664], [31547], [18736], [9259], [31], [16354], [21810], [4357], [37982], [5064], [2033], [32871], [47446], [62], [22158], [37387], [8743], [47007], [17981], [11049], [4622], [37916], [36786], [35138], [29925], [14157], [18095], [27829], [1181], [22226], [5709], [4725], [30189], [37014], [1254], [11380], [42989], [696], [24576], [39487], [30119], [1092], [8088], [2194], [9899], [14412], [21828], [3725], [13544], [5180], [44679], [34398], [3891], [28739], [14219], [37594], [49550], [11326], [6904], [17266], [5749], [10174], [23405], [9955], [38271], [41018], [13011], [48392], [36784], [24254], [21687], [23734], [5413], [41447], [45472], [10122], [17555], [15830], [47384], [12084], [31350], [47940], [11661], [27988], [45443], [905], [49651], [16614], [34993], [6781], [30803], [35869], [8001], [41604], [28118], [46462], [46762], [16262], [17281], [5774], [10943], [5013], [18257], [6750], [4713], [3951], [11899], [38791], [16943], [37596], [9318], [18413], [40473], [13208], [16375]] diff --git a/static/koboldai.js b/static/koboldai.js index cce66f80..05dcc47e 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -1472,6 +1472,7 @@ function show_model_menu(data) { document.getElementById("modelurl").classList.add("hidden"); document.getElementById("use_gpu_div").classList.add("hidden"); document.getElementById("use_8_bit_div").classList.add("hidden"); + document.getElementById("use_4_bit_div").classList.add("hidden"); document.getElementById("modellayers").classList.add("hidden"); document.getElementById("oaimodel").classList.add("hidden"); var model_layer_bars = document.getElementById('model_layer_bars'); @@ -1646,6 +1647,14 @@ function selected_model_info(data) { document.getElementById("use_8_bit").checked = false; } + //hide or unhide 4 bit mode + if (data.bit_4_available) { + document.getElementById("use_4_bit_div").classList.remove("hidden"); + } else { + document.getElementById("use_4_bit_div").classList.add("hidden"); + document.getElementById("use_4_bit").checked = false; + } + //default URL loading if (data.default_url != null) { document.getElementById("modelurl").value = data.default_url; @@ -1815,7 +1824,7 @@ function selected_model_info(data) { } accept.disabled = false; - + set_4_bit_mode(invert=false); } function 
update_gpu_layers() { @@ -1876,7 +1885,8 @@ function load_model() { 'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(), 'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value, 'online_model': selected_models, - 'use_8_bit': document.getElementById('use_8_bit').checked}; + 'use_8_bit': document.getElementById('use_8_bit').checked, + 'use_4_bit': document.getElementById('use_4_bit').checked}; socket.emit("load_model", message); closePopups(); } @@ -3160,6 +3170,22 @@ function save_preset() { closePopups(); } +function set_4_bit_mode(invert=true) { + bit_4_status = document.getElementById("use_4_bit").checked; + if (invert) { + bit_4_status = !bit_4_status; + } + if (bit_4_status) { + document.getElementById("modellayers").classList.add("hidden"); + socket.emit("use_4_bit_toggle", {"use_4_bit": false}); + } else { + document.getElementById("modellayers").classList.remove("hidden"); + socket.emit("use_4_bit_toggle", {"use_4_bit": true}); + } +} + + + //--------------------------------------------General UI Functions------------------------------------ function set_ui_level(level) { for (classname of ['setting_container', 'setting_container_single', 'setting_container_single_wide', 'biasing', 'palette_area', 'advanced_theme']) { @@ -7301,4 +7327,4 @@ $el("#gamescreen").addEventListener("paste", function(event) { false, event.clipboardData.getData("text/plain") ); -}); \ No newline at end of file +}); diff --git a/templates/popups.html b/templates/popups.html index 44cf7cb6..804b1b9f 100644 --- a/templates/popups.html +++ b/templates/popups.html @@ -75,6 +75,10 @@
Use 8 bit mode
+ @@ -402,4 +406,4 @@ -
\ No newline at end of file +
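The loading path that the patch above wires into load_model reduces to: look for a 4bit.pt checkpoint inside the model folder, pick the GPTQ load_quant helper that matches koboldai_vars.model_type, and pair it with the matching tokenizer. The sketch below restates that flow as a standalone function, assuming the GPTQ submodule is checked out at repos/gptq as in the .gitmodules change; model_dir and model_type are placeholder arguments standing in for the koboldai_vars fields, and the three-argument load_quant call mirrors this patch (a later patch in the series, "Adapt KoboldAI to latest gptq changes", adds a fourth group-size argument of -1). It is an illustrative consolidation, not the exact aiserver.py code.

    # Minimal sketch of the 4-bit GPTQ load dispatch added above.
    # Assumes repos/gptq is present; model_dir / model_type are hypothetical
    # stand-ins for koboldai_vars.custmodpth / koboldai_vars.model_type.
    import os
    import sys

    sys.path.insert(0, os.path.abspath("repos/gptq"))
    from gptj import load_quant as gptj_load_quant
    from gptneox import load_quant as gptneox_load_quant
    from llama import load_quant as llama_load_quant
    from transformers import AutoTokenizer, LlamaTokenizer


    def load_4bit(model_dir, model_type, wbits=4):
        """Load a pre-quantized GPTQ checkpoint (4bit.pt) for a supported model type."""
        checkpoint = os.path.join(model_dir, "4bit.pt")
        if not os.path.isfile(checkpoint):
            raise RuntimeError(f"4-bit load failed. PT-File not found at {checkpoint}")

        if model_type == "gptj":
            model = gptj_load_quant(model_dir, checkpoint, wbits)
            tokenizer = AutoTokenizer.from_pretrained(model_dir)
        elif model_type == "gpt_neox":
            model = gptneox_load_quant(model_dir, checkpoint, wbits)
            tokenizer = AutoTokenizer.from_pretrained(model_dir)
        elif model_type == "llama":
            model = llama_load_quant(model_dir, checkpoint, wbits)
            tokenizer = LlamaTokenizer.from_pretrained(model_dir)
        else:
            raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")

        return model, tokenizer

After this dispatch the patch keeps the quantized model on a single GPU (it forces breakmodel off and usegpu on), since the GPTQ CUDA kernels do not participate in the usual layer-splitting path.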
From c7edc764b95d44603e4d450d4326ce3628188ef3 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 21 Mar 2023 21:58:31 +0000 Subject: [PATCH 012/113] Fix llama loading --- aiserver.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/aiserver.py b/aiserver.py index 7497dfb9..967af85f 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2915,10 +2915,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal @functools.lru_cache(maxsize=None) def get_original_key(key): - # try: - return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) - # except ValueError: - # return key + try: + return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + except ValueError: + return key for key, value in model_dict.items(): original_key = get_original_key(key) @@ -3128,8 +3128,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, aborting 4-bit load") - use_4_bit = False + print(f"4-bit file {path_4bit} not found, loading failed") + raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") if use_4_bit: print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") @@ -3155,7 +3155,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) if model is None: raise RuntimeError("Model returned 'None'. 
This is not expected to happen, but due to this, the model will not load.") From 8941428c66c377baa10aa95afd3186733dd92b89 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 22 Mar 2023 06:22:34 +0000 Subject: [PATCH 013/113] Fix Kobold loading to CPU in 4-bit, causing CUDA ASSERT error --- aiserver.py | 6 ++++-- repos/gptq | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 967af85f..2c50cfcc 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3102,7 +3102,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors - if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel: + if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel and not use_4_bit: device_config(model_config) # Download model from Huggingface if it does not exist, otherwise load locally @@ -3133,6 +3133,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if use_4_bit: print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") + koboldai_vars.breakmodel = False + koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) @@ -3255,7 +3257,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) - if(not koboldai_vars.lazy_load): + if(not koboldai_vars.lazy_load and not use_4_bit): device_config(model.config) move_model_to_devices(model, use_4_bit) elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): diff --git a/repos/gptq b/repos/gptq index a8303654..791cfe37 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit a8303654c200c25577130466e5f9bc1e70fc8a50 +Subproject commit 791cfe376af33aa01032dd52147050083a6345cf From 026eb3205e0f48dac5a4aa965d3e48d79ec5e1ab Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 22 Mar 2023 22:12:06 +0000 Subject: [PATCH 014/113] Fix 4-bit loading error when not loading in 4-bit --- aiserver.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2c50cfcc..745a7cb8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3125,13 +3125,13 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): - path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") - - if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, loading failed") - raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") - if use_4_bit: + path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") + + if not os.path.isfile(path_4bit): + print(f"4-bit file {path_4bit} not found, loading failed") + raise RuntimeError(f"4-bit load failed. 
PT-File not found at {path_4bit}") + print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") koboldai_vars.breakmodel = False koboldai_vars.usegpu = True From 9dcba3897810499786d1fb4b4bd8d41ef595a130 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 24 Mar 2023 19:07:28 +0000 Subject: [PATCH 015/113] Pin transformers to a working Llama-compatible version --- environments/huggingface.yml | 2 +- environments/rocm.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 42dda9c3..6807627e 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -30,7 +30,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - git+https://github.com/zphang/transformers@llama_push + - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc - huggingface_hub==0.12.1 - safetensors diff --git a/environments/rocm.yml b/environments/rocm.yml index 43fd331f..a1d3d8b0 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -29,7 +29,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers==4.25.1 + - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 - huggingface_hub==0.12.1 - safetensors - accelerate From 2e7a8a1a66a3813ff2f68b5e37f659479f44afc2 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 23 Mar 2023 05:53:30 +0000 Subject: [PATCH 016/113] Adapt KoboldAI to latest gptq changes --- aiserver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 745a7cb8..faee85c0 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3136,13 +3136,13 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.breakmodel = False koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") From d1a2005a2710e0720fe2a863ebe4f5d1f9b2ad18 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 27 Mar 2023 20:45:21 +0000 Subject: [PATCH 017/113] Add support for old and new 4-bit format. 
Old one needs 4bit-old.pt file to launch --- aiserver.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index faee85c0..fa2af0f3 100644 --- a/aiserver.py +++ b/aiserver.py @@ -94,7 +94,6 @@ sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant -vars_4bit = {} if lupa.LUA_VERSION[:2] != (5, 4): @@ -3127,9 +3126,29 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if use_4_bit: path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") + path_4bit_old = os.path.join(koboldai_vars.custmodpth, "4bit-old.pt") + + # Monkey-patch in old-format pt-file support + if not os.path.isfile(path_4bit): + print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") + path_4bit = path_4bit_old + + import llama, opt, gptneox, gptj, old_quant, quant_cuda_old + llama.make_quant = old_quant.old_make_quant + opt.make_quant = old_quant.old_make_quant + gptneox.make_quant = old_quant.old_make_quant + gptj.make_quant = old_quant.old_make_quant + elif llama.make_quant == old_quant.old_make_quant: + # Undo monkey patch + import quant + llama.make_quant = quant.make_quant + opt.make_quant = quant.make_quant + gptneox.make_quant = quant.make_quant + gptj.make_quant = quant.make_quant + if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, loading failed") + print(f"4-bit old-format file {path_4bit} not found, loading failed") raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") From 0f1fc46078f9a751e35c0c5e7e35d091a10f3f9b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 27 Mar 2023 21:30:43 +0000 Subject: [PATCH 018/113] Fix errors during inference --- aiserver.py | 14 +++++++++++--- repos/gptq | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/aiserver.py b/aiserver.py index fa2af0f3..2c2eff1b 100644 --- a/aiserver.py +++ b/aiserver.py @@ -94,6 +94,7 @@ sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant +monkey_patched_4bit = False if lupa.LUA_VERSION[:2] != (5, 4): @@ -3128,23 +3129,28 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") path_4bit_old = os.path.join(koboldai_vars.custmodpth, "4bit-old.pt") + global monkey_patched_4bit + # Monkey-patch in old-format pt-file support if not os.path.isfile(path_4bit): print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") path_4bit = path_4bit_old - import llama, opt, gptneox, gptj, old_quant, quant_cuda_old + import llama, opt, gptneox, gptj, old_quant llama.make_quant = old_quant.old_make_quant opt.make_quant = old_quant.old_make_quant gptneox.make_quant = old_quant.old_make_quant gptj.make_quant = old_quant.old_make_quant - elif llama.make_quant == old_quant.old_make_quant: + monkey_patched_4bit = True + elif monkey_patched_4bit: # Undo monkey patch - import quant + print("Undoing 4-bit old format monkey patch") + import llama, opt, gptneox, gptj, quant llama.make_quant = quant.make_quant opt.make_quant = quant.make_quant gptneox.make_quant = quant.make_quant gptj.make_quant = quant.make_quant + 
monkey_patched_4bit = False if not os.path.isfile(path_4bit): @@ -3165,6 +3171,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") + + model = model.float() else: try: tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) diff --git a/repos/gptq b/repos/gptq index 791cfe37..0748a680 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 791cfe376af33aa01032dd52147050083a6345cf +Subproject commit 0748a680e95ab0a9f8860953a5d705a01070d1cc From ef6fe680a97efb740db946c0e4fbf5d2dd54889b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 28 Mar 2023 06:30:02 +0000 Subject: [PATCH 019/113] Fix high VRAM usage caused by workaround for scalar type error --- aiserver.py | 2 +- repos/gptq | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2c2eff1b..27cafd59 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3172,7 +3172,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") - model = model.float() + model = model.half() else: try: tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) diff --git a/repos/gptq b/repos/gptq index 0748a680..5d07f25a 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 0748a680e95ab0a9f8860953a5d705a01070d1cc +Subproject commit 5d07f25a30f8602aedb3e69f11de07624e486ce9 From e698f22706c806e05fdd8c58f91f3d560bcba0d6 Mon Sep 17 00:00:00 2001 From: Digitous <107712289+Digitous@users.noreply.github.com> Date: Tue, 28 Mar 2023 19:14:46 -0400 Subject: [PATCH 020/113] Update README.md --- README.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/README.md b/README.md index 20a1957a..c6e922aa 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,57 @@ +## This is a fork of KoboldAI that implements 4bit GPTQ quantized support to include Llama. + +### Install/Use Guide +(This guide is for both Linux and Windows and assumes user has git installed and a basic grasp of command line use) + +In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. + +git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules + +cd KoboldAI + +Next step, subfolder mode or B: option doesn't matter choose either + +[if on Windows] install_requirements.bat if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. + +[if on Linux] install_requirements.sh + + +[if on Windows] run commandline.bat + +[if on Linux] run commandline.sh + +commandline.bat/commandline.sh will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). 
+ + +cd repos + +cd gptq + + +[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] python setup_cuda.py install + +[if on Linux] python setup_cuda.py install + +After the Cuda kernel is compiled, return to KoboldAI base directory + +[if on Windows (only applies to windows users)] pip install flask_cors + +If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) + +Put your 4bit quantized .pt in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). + +Then move your model folder to KoboldAI/models, and rename the .pt in your model folder to 4bit.pt + +So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). + +If you haven't done so already, exit the command prompt/leave KAI's (base) venv + +Run play.bat [windows] or play.sh [linux] + +Switch to UI2, enable Experimental UI under the Interface tab, then load your model and be sure 4-bit toggle is on. + +The 4bit toggle shows when a model to load is selected. + ## KoboldAI - Your gateway to GPT writing This is a browser-based front-end for AI-assisted writing with multiple local & remote AI models. It offers the standard array of tools, including Memory, Author's Note, World Info, Save & Load, adjustable AI settings, formatting options, and the ability to import existing AI Dungeon adventures. You can also turn on Adventure mode and play the game like AI Dungeon Unleashed. From 8d008b87a608beb47e5f41473a40b437aa33d4b4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 13:25:06 +0000 Subject: [PATCH 021/113] Add OPT support --- aiserver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/aiserver.py b/aiserver.py index 27cafd59..edce6bf1 100644 --- a/aiserver.py +++ b/aiserver.py @@ -94,6 +94,7 @@ sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant +from opt import load_quant as opt_load_quant monkey_patched_4bit = False @@ -3169,6 +3170,9 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal elif koboldai_vars.model_type == "llama": model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif koboldai_vars.model_type == "opt": + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. 
Model type {koboldai_vars.model_type} not supported in 4-bit") From f6f7687cc015821c4d4b4cff7dbfea1052514efb Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 14:47:59 +0000 Subject: [PATCH 022/113] Add 4bit safetensor support, improve loading code --- aiserver.py | 78 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/aiserver.py b/aiserver.py index edce6bf1..2679ddc8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -90,6 +90,7 @@ global tpu_mtj_backend # 4-bit dependencies from pathlib import Path +import glob sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant @@ -2657,6 +2658,50 @@ def unload_model(): #Reload our badwords koboldai_vars.badwordsids = koboldai_settings.badwordsids_default + + +def prepare_4bit_load(modelpath): + paths_4bit = ["4bit.pt", "4bit.safetensors"] + paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] + result = False + for p in paths_4bit: + p = os.path.join(modelpath, p) + if os.path.isfile(p): + result = p + break + + global monkey_patched_4bit + + # Monkey-patch in old-format pt-file support + if not result: + print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") + for p in paths_4bit_old: + p = os.path.join(modelpath, p) + if os.path.isfile(p): + result = p + break + + if not result: + print(f"4-bit old-format file {path_4bit} not found, loading failed") + raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") + + import llama, opt, gptneox, gptj, old_quant + llama.make_quant = old_quant.old_make_quant + opt.make_quant = old_quant.old_make_quant + gptneox.make_quant = old_quant.old_make_quant + gptj.make_quant = old_quant.old_make_quant + monkey_patched_4bit = True + elif monkey_patched_4bit: + # Undo monkey patch + print("Undoing 4-bit old format monkey patch") + import llama, opt, gptneox, gptj, quant + llama.make_quant = quant.make_quant + opt.make_quant = quant.make_quant + gptneox.make_quant = quant.make_quant + gptj.make_quant = quant.make_quant + monkey_patched_4bit = False + + return result def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False): @@ -3127,36 +3172,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(os.path.isdir(koboldai_vars.custmodpth)): if use_4_bit: - path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") - path_4bit_old = os.path.join(koboldai_vars.custmodpth, "4bit-old.pt") - - global monkey_patched_4bit - - # Monkey-patch in old-format pt-file support - if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") - path_4bit = path_4bit_old - - import llama, opt, gptneox, gptj, old_quant - llama.make_quant = old_quant.old_make_quant - opt.make_quant = old_quant.old_make_quant - gptneox.make_quant = old_quant.old_make_quant - gptj.make_quant = old_quant.old_make_quant - monkey_patched_4bit = True - elif monkey_patched_4bit: - # Undo monkey patch - print("Undoing 4-bit old format monkey patch") - import llama, opt, gptneox, gptj, quant - llama.make_quant = quant.make_quant - opt.make_quant = quant.make_quant - gptneox.make_quant = quant.make_quant - gptj.make_quant = quant.make_quant - monkey_patched_4bit = False - - - if not os.path.isfile(path_4bit): - 
print(f"4-bit old-format file {path_4bit} not found, loading failed") - raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") + path_4bit = prepare_4bit_load(koboldai_vars.custmodpth) print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") koboldai_vars.breakmodel = False @@ -3171,7 +3187,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") From a0bc77042624571b878d734ebc41331f6f4d9342 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 19:49:05 +0000 Subject: [PATCH 023/113] Add basic groupsize support Write groupsize into filename, for example 4bit-128b.safetensors for groupsize 128 --- aiserver.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2679ddc8..38805287 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2661,13 +2661,19 @@ def unload_model(): def prepare_4bit_load(modelpath): - paths_4bit = ["4bit.pt", "4bit.safetensors"] + paths_4bit = ["4bit*.safetensors", "4bit*.pt"] paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] result = False + groupsize = -1 for p in paths_4bit: p = os.path.join(modelpath, p) - if os.path.isfile(p): - result = p + val = glob.glob(p) + if val: + result = val[0] + fname = Path(result).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\d+)(?:b-?)", fname) + if g: + groupsize = int(g[0]) break global monkey_patched_4bit @@ -2701,7 +2707,7 @@ def prepare_4bit_load(modelpath): gptj.make_quant = quant.make_quant monkey_patched_4bit = False - return result + return result, groupsize def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False): @@ -3172,22 +3178,23 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(os.path.isdir(koboldai_vars.custmodpth)): if use_4_bit: - path_4bit = prepare_4bit_load(koboldai_vars.custmodpth) + path_4bit, groupsize = prepare_4bit_load(koboldai_vars.custmodpth) + print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") koboldai_vars.breakmodel = False koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif 
koboldai_vars.model_type == "opt": - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") From 73d5ec0e5dd234852a66331b681734e8beb13781 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 20:07:26 +0000 Subject: [PATCH 024/113] Pull latest gptq-changes --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 5d07f25a..6f80e1fd 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 5d07f25a30f8602aedb3e69f11de07624e486ce9 +Subproject commit 6f80e1fdd7232d66e45e02ebf00be4c5d5f933bc From 9d0477f5f73471995fa3e23789a0ac4aa9108b33 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 22:05:44 +0000 Subject: [PATCH 025/113] Fix bug where it picks old model despite new one available --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 38805287..812bc4a8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2667,7 +2667,7 @@ def prepare_4bit_load(modelpath): groupsize = -1 for p in paths_4bit: p = os.path.join(modelpath, p) - val = glob.glob(p) + val = [v for v in glob.glob(p) if "4bit-old" not in v] if val: result = val[0] fname = Path(result).parts[-1] From 61b13604b6ad116561488ab146c3959f40d98099 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 30 Mar 2023 10:57:04 +0200 Subject: [PATCH 026/113] Fix bug in 4-bit load fallback --- aiserver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 812bc4a8..fe0f9a8c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2680,7 +2680,7 @@ def prepare_4bit_load(modelpath): # Monkey-patch in old-format pt-file support if not result: - print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") + print("4-bit file not found, falling back to old format.") for p in paths_4bit_old: p = os.path.join(modelpath, p) if os.path.isfile(p): @@ -2688,8 +2688,8 @@ def prepare_4bit_load(modelpath): break if not result: - print(f"4-bit old-format file {path_4bit} not found, loading failed") - raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") + print("4-bit old-format file not found, loading failed.") + raise RuntimeError(f"4-bit load failed. 
PT-File not found.") import llama, opt, gptneox, gptj, old_quant llama.make_quant = old_quant.old_make_quant From aa2292b3a4dff467e9afaa3270d80fcda4c7994f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 21:43:49 +0000 Subject: [PATCH 027/113] Enable multi-gpu support --- aiserver.py | 50 ++++++++++++++++------------------------------ static/koboldai.js | 9 +-------- 2 files changed, 18 insertions(+), 41 deletions(-) diff --git a/aiserver.py b/aiserver.py index fe0f9a8c..7a4370c0 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1139,7 +1139,7 @@ def move_model_to_devices(model, use_4_bit=False): import accelerate.utils for key, value in model.state_dict().items(): target_dtype = torch.float32 if breakmodel.primary_device == "cpu" else torch.float16 - if(value.dtype is not target_dtype): + if(value.dtype not in (torch.bool, torch.int) and value.dtype is not target_dtype): accelerate.utils.set_module_tensor_to_device(model, key, target_dtype) disk_blocks = breakmodel.disk_blocks gpu_blocks = breakmodel.gpu_blocks @@ -1919,18 +1919,6 @@ def get_cluster_models(msg): emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2") -@socketio.on("use_4_bit_toggle") -def use_4_bit_toggle(msg): - # Disable lazy_load and breakmodel - if msg["use_4_bit"]: - koboldai_vars.lazy_load = False - koboldai_vars.nobreakmodel = True - else: - koboldai_vars.lazy_load = True - koboldai_vars.nobreakmodel = False - - # TODO: Reload JS values for this stuff - # Function to patch transformers to use our soft prompt def patch_causallm(model): from torch.nn import Embedding @@ -3033,11 +3021,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
", end="", flush=True) model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - if not use_4_bit: - if model_dict[key].dtype is torch.float32: - koboldai_vars.fp32_model = True - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - model_dict[key] = model_dict[key].to(torch.float16) + if model_dict[key].dtype is torch.float32: + koboldai_vars.fp32_model = True + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: + model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -3061,17 +3048,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if utils.offload_index: for name, tensor in utils.named_buffers: dtype = tensor.dtype - if not use_4_bit: - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): - dtype = torch.float32 - if name in model_dict and model_dict[name].dtype is not dtype: - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is not dtype: - tensor = tensor.to(dtype) - if name not in utils.offload_index: - accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): + dtype = torch.float16 + if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): + dtype = torch.float32 + if name in model_dict and model_dict[name].dtype is not dtype: + model_dict[name] = model_dict[name].to(dtype) + if tensor.dtype is not dtype: + tensor = tensor.to(dtype) + if name not in utils.offload_index: + accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache") utils.bar.close() utils.bar = None @@ -3154,7 +3140,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors - if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel and not use_4_bit: + if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel: device_config(model_config) # Download model from Huggingface if it does not exist, otherwise load locally @@ -3182,8 +3168,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") - koboldai_vars.breakmodel = False - koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": model = 
gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) @@ -3311,7 +3295,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) - if(not koboldai_vars.lazy_load and not use_4_bit): + if(not koboldai_vars.lazy_load): device_config(model.config) move_model_to_devices(model, use_4_bit) elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): diff --git a/static/koboldai.js b/static/koboldai.js index 05dcc47e..89ee2ea1 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -3173,14 +3173,7 @@ function save_preset() { function set_4_bit_mode(invert=true) { bit_4_status = document.getElementById("use_4_bit").checked; if (invert) { - bit_4_status = !bit_4_status; - } - if (bit_4_status) { - document.getElementById("modellayers").classList.add("hidden"); - socket.emit("use_4_bit_toggle", {"use_4_bit": false}); - } else { - document.getElementById("modellayers").classList.remove("hidden"); - socket.emit("use_4_bit_toggle", {"use_4_bit": true}); + bit_4_status = !bit_4_status; } } From 6eae4574793687b517c45e85e5fc178015c8d088 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 31 Mar 2023 15:36:03 +0200 Subject: [PATCH 028/113] Fix 4bit groupsize param letter Use g instead of b for groupsize name, for example 4bit-128g.safetensors --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 7a4370c0..e7c789ac 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2659,7 +2659,7 @@ def prepare_4bit_load(modelpath): if val: result = val[0] fname = Path(result).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\d+)(?:b-?)", fname) + g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname) if g: groupsize = int(g[0]) break From d3a5ca65057f4f7cf9a2998cd13e5e04de829df1 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 1 Apr 2023 08:52:08 +0000 Subject: [PATCH 029/113] Update gptq submodule to latest --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 6f80e1fd..f4de1019 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 6f80e1fdd7232d66e45e02ebf00be4c5d5f933bc +Subproject commit f4de1019fedce779915e050a517f4cc8dee432dc From bf0c999412b48a6de6a174a33bce3f8b92df1e16 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 1 Apr 2023 14:19:51 +0200 Subject: [PATCH 030/113] Update GPTQ to support AMD --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index f4de1019..954b3218 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit f4de1019fedce779915e050a517f4cc8dee432dc +Subproject commit 954b32183adda2acd437a3ab0683a28ca3c7e4c9 From 110f8229c565a1ac64060e4e1785d4563920d4f4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 1 Apr 2023 21:33:05 +0200 Subject: [PATCH 031/113] Add cudatoolkit-dev for compilation, compatible gcc 9 and update transformers to fix error in gptq --- environments/huggingface.yml | 5 ++++- environments/rocm.yml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 6807627e..71d26e9c 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -11,6 +11,9 @@ dependencies: - pytorch=1.11.* - python=3.8.* - cudatoolkit=11.1 + - cudatoolkit-dev=11.1 
+ - gcc=9.* + - gxx=9.* - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -30,7 +33,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 + - git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc - huggingface_hub==0.12.1 - safetensors diff --git a/environments/rocm.yml b/environments/rocm.yml index a1d3d8b0..dda2a2b2 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -29,7 +29,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 + - git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda - huggingface_hub==0.12.1 - safetensors - accelerate From 2729b7764047b7c1d35f7a20e5900d61147fe598 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 2 Apr 2023 10:32:19 +0200 Subject: [PATCH 032/113] Add offload.py adapted from llama_inference_offload.py, with multi-gpu support and some improvements. Not yet functional, and still just supports Llama --- aiserver.py | 17 +++++++++++++++-- repos/gptq | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index e7c789ac..82992461 100644 --- a/aiserver.py +++ b/aiserver.py @@ -96,6 +96,7 @@ from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant from opt import load_quant as opt_load_quant +from offload import load_quant_offload monkey_patched_4bit = False @@ -3137,6 +3138,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.model_type == "gpt2"): lowmem = {} koboldai_vars.lazy_load = False # Also, lazy loader doesn't support GPT-2 models + + gpu_layers_list = [int(l) for l in gpu_layers.split(",")] + offload_4bit = use_4_bit and sum(gpu_layers_list) < utils.num_layers(model_config) + + if offload_4bit: + koboldai_vars.lazy_load = False # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors @@ -3175,7 +3182,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(llama_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) @@ -3286,7 +3296,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal patch_causallm(model) if(koboldai_vars.hascuda): - if(koboldai_vars.usegpu): + if offload_4bit: + koboldai_vars.modeldim = get_hidden_size_from_model(model) + generator = model.generate + elif(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) if not use_4_bit: model = model.half().to(koboldai_vars.gpu_device) diff --git a/repos/gptq b/repos/gptq index 954b3218..f8bc2886 160000 --- 
a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 954b32183adda2acd437a3ab0683a28ca3c7e4c9 +Subproject commit f8bc2886cb2e2aaa704ea02404c2ff3841eb6fcf From e742083703ea8111379492c75e62f9dfffd54a28 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 2 Apr 2023 11:17:29 +0200 Subject: [PATCH 033/113] Fix multi-gpu-offloading --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index f8bc2886..971a5785 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit f8bc2886cb2e2aaa704ea02404c2ff3841eb6fcf +Subproject commit 971a5785a356460f2073b0141da0c1e0b8fdcbf6 From c8d00b7a10fd48f31f9d3fc4f4010f5481c772d4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 2 Apr 2023 18:36:31 +0200 Subject: [PATCH 034/113] Add CPU offloading support for GPT-NeoX, GPT-J and OPT --- aiserver.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 82992461..2365f58b 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3144,6 +3144,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if offload_4bit: koboldai_vars.lazy_load = False + print("4-bit CPU offloader active") # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors @@ -3176,10 +3177,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") if koboldai_vars.model_type == "gptj": - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(gptj_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(gptneox_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": if offload_4bit: @@ -3188,7 +3195,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(opt_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. 
Model type {koboldai_vars.model_type} not supported in 4-bit") From ec4177a6d6cf3549f3aebffc1a54b4799c506657 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 3 Apr 2023 06:50:36 +0200 Subject: [PATCH 035/113] Remove cudatoolkit-dev and gcc/gxx 9 from conda env because they didn't resolve on Windows --- environments/huggingface.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 71d26e9c..b48c2547 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -11,9 +11,6 @@ dependencies: - pytorch=1.11.* - python=3.8.* - cudatoolkit=11.1 - - cudatoolkit-dev=11.1 - - gcc=9.* - - gxx=9.* - eventlet=0.33.3 - dnspython=2.2.1 - markdown From b9df9b6f590388a8fc6139e25b1d1c24c21fac52 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 3 Apr 2023 20:27:17 +0200 Subject: [PATCH 036/113] Improve CPU offloading speed significantly when offloading less than half of the layers --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 971a5785..e2f567e9 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 971a5785a356460f2073b0141da0c1e0b8fdcbf6 +Subproject commit e2f567e9332c2d92f1c26ea0f7c935fe7cd65655 From ce6761e74436298424d3ea7bb964bb243e8cd88a Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 4 Apr 2023 07:46:53 +0200 Subject: [PATCH 037/113] Fix issue causing expected scalar type Float but found Half RuntimeErrors --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index e2f567e9..08c5054d 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit e2f567e9332c2d92f1c26ea0f7c935fe7cd65655 +Subproject commit 08c5054d45b8c6277e74a35841570dc7b8cbc608 From 8b4375307c2e4ea1154125fea1e00ef8c1b38415 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 5 Apr 2023 21:10:40 +0200 Subject: [PATCH 038/113] Update file formatting section in guide --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c6e922aa..0296e876 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ If you haven't already done so, create a model folder with the same name as your Put your 4bit quantized .pt in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). -Then move your model folder to KoboldAI/models, and rename the .pt in your model folder to 4bit.pt +Then move your model folder to KoboldAI/models, and rename the .pt/safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). 
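Note on the file-naming convention described in the guide above: the loader built up in the preceding patches (`prepare_4bit_load` in `aiserver.py`, patches 022/023/028) selects the quantized checkpoint by globbing for `4bit*.safetensors` / `4bit*.pt` and parsing the optional groupsize out of the filename. A condensed, self-contained sketch of that detection step follows; the helper name is illustrative, but the glob patterns, the `4bit-old` exclusion, and the regex are the ones used in the diffs.

```python
import glob
import os
import re
from pathlib import Path


def find_4bit_checkpoint(modelpath):
    """Sketch of the 4-bit checkpoint/groupsize detection from prepare_4bit_load.

    Prefers 4bit*.safetensors over 4bit*.pt, skips old-format 4bit-old.* files,
    and reads an optional groupsize from the name (e.g. 4bit-128g.safetensors -> 128).
    Returns (path, groupsize); groupsize stays -1 when the filename carries none.
    """
    groupsize = -1
    for pattern in ("4bit*.safetensors", "4bit*.pt"):
        candidates = [p for p in glob.glob(os.path.join(modelpath, pattern))
                      if "4bit-old" not in p]
        if candidates:
            fname = Path(candidates[0]).parts[-1]
            g = re.findall(r"^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
            if g:
                groupsize = int(g[0])
            return candidates[0], groupsize
    return None, groupsize


# Example: a model folder containing 4bit-128g.safetensors resolves to
# ("<modelpath>/4bit-128g.safetensors", 128); a plain 4bit.pt resolves to (path, -1).
```

In other words, `4bit.safetensors` or `4bit.pt` load with groupsize -1 (no groupsize), while a name like `4bit-128g.safetensors` loads with groupsize 128, matching the example given in the guide.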
From 40092cc9faed0d225391699e4cada1b9fb043dff Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 5 Apr 2023 21:49:13 +0200 Subject: [PATCH 039/113] Improve guide formatting --- README.md | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 0296e876..e103bbff 100644 --- a/README.md +++ b/README.md @@ -5,48 +5,46 @@ In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. -git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules +`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules` -cd KoboldAI +`cd KoboldAI` -Next step, subfolder mode or B: option doesn't matter choose either +Next step, (Windows) subfolder mode or B: option doesn't matter choose either -[if on Windows] install_requirements.bat if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. +[if on Windows] `install_requirements.bat` if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. -[if on Linux] install_requirements.sh +[if on Linux] `install_requirements.sh` -[if on Windows] run commandline.bat +[if on Windows] run `commandline.bat` -[if on Linux] run commandline.sh +[if on Linux] run `commandline.sh` -commandline.bat/commandline.sh will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). +`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). -cd repos +`cd repos` -cd gptq +`cd gptq` -[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] python setup_cuda.py install +[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] `python setup_cuda.py install` -[if on Linux] python setup_cuda.py install +[if on Linux] `python setup_cuda.py install` After the Cuda kernel is compiled, return to KoboldAI base directory -[if on Windows (only applies to windows users)] pip install flask_cors - If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) -Put your 4bit quantized .pt in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). +Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). 
-Then move your model folder to KoboldAI/models, and rename the .pt/safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) +Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). If you haven't done so already, exit the command prompt/leave KAI's (base) venv -Run play.bat [windows] or play.sh [linux] +Run `play.bat` [windows] or `play.sh` [linux] Switch to UI2, enable Experimental UI under the Interface tab, then load your model and be sure 4-bit toggle is on. From 636c4e5a5284fa2a11af7aba2fdf55426047eb0f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 7 Apr 2023 11:48:57 +0200 Subject: [PATCH 040/113] Update gptq repo --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 08c5054d..17c46a59 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 08c5054d45b8c6277e74a35841570dc7b8cbc608 +Subproject commit 17c46a59ff20da657e68f3267f853b0243b983bc From 7efd314428e0ad24b33fc9cd9ac19b45c6754e7b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 7 Apr 2023 20:10:24 +0200 Subject: [PATCH 041/113] Improve guide --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 86661df3..f9be9660 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,11 @@ Next step, (Windows) subfolder mode or B: option doesn't matter choose either [if on Linux] run `commandline.sh` -`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). - +`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment. +On Windows, this will show (base) at the start of the prompt line. +If it shows (base) on Linux, you most likely have a base conda environment that you need to deactivate (`conda deactivate`) +Then run `cd repos` `cd gptq` @@ -42,7 +44,7 @@ Then move your model folder to KoboldAI/models, and rename the .pt or .safetenso So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). -If you haven't done so already, exit the command prompt/leave KAI's (base) venv +If you haven't done so already, exit the command prompt/leave KAI's conda env. 
(Close the commandline window on Windows, run `exit` on Linux) Run `play.bat` [windows] or `play.sh` [linux] From b628aec7194783da09035a3b8fe01f674df542ea Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 10 Apr 2023 22:37:16 +0200 Subject: [PATCH 042/113] Automatic installation of the quant_cuda module during install_requirements Kepler (K40+) and Maxwell support --- install_requirements.bat | 4 ++++ install_requirements.sh | 3 +++ repos/gptq | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/install_requirements.bat b/install_requirements.bat index 2a4534c1..05264259 100644 --- a/install_requirements.bat +++ b/install_requirements.bat @@ -49,6 +49,8 @@ umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\h umamba.exe -r B:\ clean -a -y rd B:\Python\pkgs /S /Q subst B: /d +call B:\python\condabin\activate +cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit @@ -60,5 +62,7 @@ umamba.exe create -r miniconda3\ -n base umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingface.yml -y --always-copy umamba.exe clean -a -y rd miniconda3\Python\pkgs /S /Q +call miniconda3\condabin\activate +cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit diff --git a/install_requirements.sh b/install_requirements.sh index 6f0e0dfd..7b5a8d5b 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -5,6 +5,9 @@ wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar - bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y + +# Install quant_cuda module for 4-bit +bin/micromamba run -r runtime -n koboldai pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl exit fi if [[ $1 = "rocm" ]]; then diff --git a/repos/gptq b/repos/gptq index 17c46a59..50b22e2b 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 17c46a59ff20da657e68f3267f853b0243b983bc +Subproject commit 50b22e2ba8ec0f5cf0dca719392a2ec5254e7228 From 687d107d20345a0cc46bb069914d0ce6a3bcf43d Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 10 Apr 2023 22:46:12 +0200 Subject: [PATCH 043/113] Update README, remove steps that are no longer required --- README.md | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/README.md b/README.md index f9be9660..0657fa0b 100644 --- a/README.md +++ b/README.md @@ -15,27 +15,6 @@ Next step, (Windows) subfolder mode or B: option doesn't matter choose either [if on Linux] `install_requirements.sh` - -[if on Windows] run `commandline.bat` - -[if on Linux] run `commandline.sh` - -`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment. -On Windows, this will show (base) at the start of the prompt line. 
-If it shows (base) on Linux, you most likely have a base conda environment that you need to deactivate (`conda deactivate`) - -Then run -`cd repos` - -`cd gptq` - - -[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] `python setup_cuda.py install` - -[if on Linux] `python setup_cuda.py install` - -After the Cuda kernel is compiled, return to KoboldAI base directory - If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). From 35f908e147fcac121bdafaf7ca4b751d8091f480 Mon Sep 17 00:00:00 2001 From: biscober <50845461+biscober@users.noreply.github.com> Date: Tue, 11 Apr 2023 02:37:48 +0000 Subject: [PATCH 044/113] Update install_requirements.bat (#7) * Update install_requirements.bat move command to dismount temp B drive to after pip install command which requires B drive to still be mounted * Update install_requirements.bat cmd /k not necessary * Update install_requirements.bat add quotes (probably not required but w/e) --- install_requirements.bat | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/install_requirements.bat b/install_requirements.bat index 05264259..3b735ddf 100644 --- a/install_requirements.bat +++ b/install_requirements.bat @@ -48,9 +48,9 @@ umamba.exe create -r B:\python\ -n base umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy umamba.exe -r B:\ clean -a -y rd B:\Python\pkgs /S /Q -subst B: /d call B:\python\condabin\activate -cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" +pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" +subst B: /d pause exit @@ -63,6 +63,6 @@ umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingf umamba.exe clean -a -y rd miniconda3\Python\pkgs /S /Q call miniconda3\condabin\activate -cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" +pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit From 4d34f9b7de03c6843e05cf5e11864d6b180a07b5 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 16 Apr 2023 14:20:13 +0200 Subject: [PATCH 045/113] Move 4-bit loading code to separate inference_model file --- aiserver.py | 91 ++--- modeling/inference_models/hf_torch_4bit.py | 385 +++++++++++++++++++++ 2 files changed, 412 insertions(+), 64 deletions(-) create mode 100644 modeling/inference_models/hf_torch_4bit.py diff --git a/aiserver.py b/aiserver.py index 7e9241f5..0a98d16f 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1776,56 +1776,6 @@ def unload_model(): #Reload our badwords koboldai_vars.badwordsids = koboldai_settings.badwordsids_default - - -def prepare_4bit_load(modelpath): - paths_4bit = ["4bit*.safetensors", "4bit*.pt"] - paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] - result = False - groupsize = -1 - for p in paths_4bit: - p = os.path.join(modelpath, p) - val = [v for v in glob.glob(p) if "4bit-old" not in v] - if val: - result = val[0] - fname = Path(result).parts[-1] - g = 
re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname) - if g: - groupsize = int(g[0]) - break - - global monkey_patched_4bit - - # Monkey-patch in old-format pt-file support - if not result: - print("4-bit file not found, falling back to old format.") - for p in paths_4bit_old: - p = os.path.join(modelpath, p) - if os.path.isfile(p): - result = p - break - - if not result: - print("4-bit old-format file not found, loading failed.") - raise RuntimeError(f"4-bit load failed. PT-File not found.") - - import llama, opt, gptneox, gptj, old_quant - llama.make_quant = old_quant.old_make_quant - opt.make_quant = old_quant.old_make_quant - gptneox.make_quant = old_quant.old_make_quant - gptj.make_quant = old_quant.old_make_quant - monkey_patched_4bit = True - elif monkey_patched_4bit: - # Undo monkey patch - print("Undoing 4-bit old format monkey patch") - import llama, opt, gptneox, gptj, quant - llama.make_quant = quant.make_quant - opt.make_quant = quant.make_quant - gptneox.make_quant = quant.make_quant - gptj.make_quant = quant.make_quant - monkey_patched_4bit = False - - return result, groupsize def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False): @@ -2008,9 +1958,9 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except: pass - try: - from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel - model = GenericHFTorchInferenceModel( + if use_4_bit: + from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel + model = HFTorch4BitInferenceModel( koboldai_vars.model, lazy_load=koboldai_vars.lazy_load, low_mem=args.lowmem @@ -2020,18 +1970,31 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal save_model=not (args.colab or args.cacheonly) or args.savemodel, initial_load=initial_load, ) - except SuperLegacyModelError: - from modeling.inference_models.legacy_gpt2_hf import CustomGPT2HFTorchInferenceModel - model = CustomGPT2HFTorchInferenceModel( - koboldai_vars.model, - lazy_load=koboldai_vars.lazy_load, - low_mem=args.lowmem - ) + else: + try: + from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel + model = GenericHFTorchInferenceModel( + koboldai_vars.model, + lazy_load=koboldai_vars.lazy_load, + low_mem=args.lowmem + ) - model.load( - save_model=not (args.colab or args.cacheonly) or args.savemodel, - initial_load=initial_load, - ) + model.load( + save_model=not (args.colab or args.cacheonly) or args.savemodel, + initial_load=initial_load, + ) + except SuperLegacyModelError: + from modeling.inference_models.legacy_gpt2_hf import CustomGPT2HFTorchInferenceModel + model = CustomGPT2HFTorchInferenceModel( + koboldai_vars.model, + lazy_load=koboldai_vars.lazy_load, + low_mem=args.lowmem + ) + + model.load( + save_model=not (args.colab or args.cacheonly) or args.savemodel, + initial_load=initial_load, + ) logger.info(f"Pipeline created: {koboldai_vars.model}") else: diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py new file mode 100644 index 00000000..21f4ebfe --- /dev/null +++ b/modeling/inference_models/hf_torch_4bit.py @@ -0,0 +1,385 @@ +from __future__ import annotations + +import os +import json +import torch +import re +import shutil +import sys +from typing import Union + +from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, 
LlamaTokenizer +from modeling.inference_model import SuperLegacyModelError + +import utils +import modeling.lazy_loader as lazy_loader +import koboldai_settings +from logger import logger, set_logger_verbosity, quiesce_logger + +try: + import breakmodel +except ModuleNotFoundError as e: + # Breakmodel is only expected to work on GPU + if not utils.koboldai_vars.use_colab_tpu: + raise e + +from modeling.inference_models.hf_torch import HFTorchInferenceModel + +# 4-bit dependencies +from pathlib import Path +import glob +sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) +from gptj import load_quant as gptj_load_quant +from gptneox import load_quant as gptneox_load_quant +from llama import load_quant as llama_load_quant +from opt import load_quant as opt_load_quant +from offload import load_quant_offload +monkey_patched_4bit = False + + +def prepare_4bit_load(modelpath): + paths_4bit = ["4bit*.safetensors", "4bit*.pt"] + paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] + result = False + groupsize = -1 + for p in paths_4bit: + p = os.path.join(modelpath, p) + val = [v for v in glob.glob(p) if "4bit-old" not in v] + if val: + result = val[0] + fname = Path(result).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname) + if g: + groupsize = int(g[0]) + break + + global monkey_patched_4bit + + # Monkey-patch in old-format pt-file support + if not result: + print("4-bit file not found, falling back to old format.") + for p in paths_4bit_old: + p = os.path.join(modelpath, p) + if os.path.isfile(p): + result = p + break + + if not result: + print("4-bit old-format file not found, loading failed.") + raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.") + + import llama, opt, gptneox, gptj, old_quant + llama.make_quant = old_quant.old_make_quant + opt.make_quant = old_quant.old_make_quant + gptneox.make_quant = old_quant.old_make_quant + gptj.make_quant = old_quant.old_make_quant + monkey_patched_4bit = True + elif monkey_patched_4bit: + # Undo monkey patch + print("Undoing 4-bit old format monkey patch") + import llama, opt, gptneox, gptj, quant + llama.make_quant = quant.make_quant + opt.make_quant = quant.make_quant + gptneox.make_quant = quant.make_quant + gptj.make_quant = quant.make_quant + monkey_patched_4bit = False + + return result, groupsize + + +class HFTorch4BitInferenceModel(HFTorchInferenceModel): + def _load(self, save_model: bool, initial_load: bool) -> None: + utils.koboldai_vars.allowsp = True + + # Make model path the same as the model name to make this consistent + # with the other loading method if it isn't a known model type. 
This + # code is not just a workaround for below, it is also used to make the + # behavior consistent with other loading methods - Henk717 + # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]: + # utils.koboldai_vars.custmodpth = utils.koboldai_vars.model + + if self.model_name == "NeoCustom": + self.model_name = os.path.basename( + os.path.normpath(utils.koboldai_vars.custmodpth) + ) + utils.koboldai_vars.model = self.model_name + + self.lazy_load = False + + self.init_model_config() + + gpulayers = utils.args.breakmodel_gpulayers + + try: + gpu_layers_list = [int(l) for l in gpulayers.split(",")] + except ValueError: + gpu_layers_list = [utils.num_layers(self.model_config)] + self.offload_4bit = sum(gpu_layers_list) < utils.num_layers(self.model_config) + + if self.offload_4bit: + utils.koboldai_vars.lazy_load = False + print("4-bit CPU offloader active") + + tf_kwargs = { + "low_cpu_mem_usage": True, + } + + # If we're using torch_lazy_loader, we need to get breakmodel config + # early so that it knows where to load the individual model tensors + if ( + self.lazy_load + and utils.koboldai_vars.hascuda + and utils.koboldai_vars.breakmodel + and not utils.koboldai_vars.nobreakmodel + ): + self.breakmodel_device_config(self.model_config) + + if self.lazy_load: + # If we're using lazy loader, we need to figure out what the model's hidden layers are called + with lazy_loader.use_lazy_load( + dematerialized_modules=True, use_accelerate_init_empty_weights=True + ): + try: + metamodel = AutoModelForCausalLM.from_config(self.model_config) + except Exception as e: + logger.error(f"Fell back to neo for metamodel due to {e}") + try: + metamodel = GPTNeoForCausalLM.from_config(self.model_config) + except Exception as e: + logger.error(f"Falling back again due to {e}") + raise SuperLegacyModelError + + utils.layers_module_names = utils.get_layers_module_names(metamodel) + utils.module_names = list(metamodel.state_dict().keys()) + utils.named_buffers = list(metamodel.named_buffers(recurse=True)) + + # Download model from Huggingface if it does not exist, otherwise load locally + with self._maybe_use_float16(), lazy_loader.use_lazy_load( + enable=self.lazy_load, + callback=self._get_lazy_load_callback(utils.num_layers(self.model_config)) + if self.lazy_load + else None, + dematerialized_modules=True, + ): + if self.lazy_load: + # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time + tf_kwargs.pop("low_cpu_mem_usage", None) + + if self.get_local_model_path(): + # Model is stored locally, load it. + self.model = self._get_model(self.get_local_model_path(), tf_kwargs) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) + else: + # Model not stored locally, we need to download it. 
+ + # _rebuild_tensor patch for casting dtype and supporting LazyTensors + old_rebuild_tensor = torch._utils._rebuild_tensor + + def new_rebuild_tensor( + storage: Union[lazy_loader.LazyTensor, torch.Storage], + storage_offset, + shape, + stride, + ): + if not isinstance(storage, lazy_loader.LazyTensor): + dtype = storage.dtype + else: + dtype = storage.storage_type.dtype + if not isinstance(dtype, torch.dtype): + dtype = storage.storage_type(0).dtype + if dtype is torch.float32 and len(shape) >= 2: + utils.koboldai_vars.fp32_model = True + return old_rebuild_tensor(storage, storage_offset, shape, stride) + + torch._utils._rebuild_tensor = new_rebuild_tensor + self.model = self._get_model(self.model_name, tf_kwargs) + self.tokenizer = self._get_tokenizer(self.model_name) + torch._utils._rebuild_tensor = old_rebuild_tensor + + if save_model: + self.tokenizer.save_pretrained( + self.get_local_model_path(ignore_existance=True) + ) + + if utils.koboldai_vars.fp32_model and not breakmodel.disk_blocks: + # Use save_pretrained to convert fp32 models to fp16, + # unless we are using disk cache because save_pretrained + # is not supported in that case + self.model = self.model.half() + self.model.save_pretrained( + self.get_local_model_path(ignore_existance=True), + max_shard_size="500MiB", + ) + + else: + # For fp16 models, we can just copy the model files directly + import transformers.configuration_utils + import transformers.modeling_utils + import transformers.file_utils + import huggingface_hub + + # Save the config.json + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + self.model_name, + transformers.configuration_utils.CONFIG_NAME, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + os.path.join( + self.get_local_model_path(ignore_existance=True), + transformers.configuration_utils.CONFIG_NAME, + ), + ) + + if utils.num_shards is None: + # Save the pytorch_model.bin or model.safetensors of an unsharded model + any_success = False + possible_checkpoint_names = [ + transformers.modeling_utils.WEIGHTS_NAME, + "model.safetensors", + ] + + for possible_checkpoint_name in possible_checkpoint_names: + try: + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + self.model_name, + possible_checkpoint_name, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + os.path.join( + self.get_local_model_path( + ignore_existance=True + ), + possible_checkpoint_name, + ), + ) + any_success = True + except Exception: + pass + + if not any_success: + raise RuntimeError(f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revisison}'") + else: + # Handle saving sharded models + + with open(utils.from_pretrained_index_filename) as f: + map_data = json.load(f) + filenames = set(map_data["weight_map"].values()) + # Save the pytorch_model.bin.index.json of a sharded model + shutil.move( + os.path.realpath(utils.from_pretrained_index_filename), + os.path.join( + self.get_local_model_path(ignore_existance=True), + transformers.modeling_utils.WEIGHTS_INDEX_NAME, + ), + ) + # Then save the pytorch_model-#####-of-#####.bin files + for filename in filenames: + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + self.model_name, + filename, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + 
os.path.join( + self.get_local_model_path( + ignore_existance=True + ), + filename, + ), + ) + shutil.rmtree("cache/") + + if not self.lazy_load: + utils.layers_module_names = utils.get_layers_module_names(self.model) + utils.module_names = list(self.model.state_dict().keys()) + utils.named_buffers = list(self.model.named_buffers(recurse=True)) + + if ( + utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default + and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj") + ): + utils.koboldai_vars.badwordsids = [ + [v] + for k, v in self.tokenizer.get_vocab().items() + if any(c in str(k) for c in "[]") + ] + + self.patch_embedding() + + if utils.koboldai_vars.hascuda: + if utils.koboldai_vars.usegpu: + # Use just VRAM + self.model = self.model.half().to(utils.koboldai_vars.gpu_device) + elif utils.koboldai_vars.breakmodel: + # Use both RAM and VRAM (breakmodel) + if not self.lazy_load: + self.breakmodel_device_config(self.model.config) + self._move_to_devices() + elif breakmodel.disk_blocks > 0: + # Use disk + self._move_to_devices() + else: + # Use CPU + self.model = self.model.to("cpu").float() + elif breakmodel.disk_blocks > 0: + self._move_to_devices() + else: + self.model = self.model.to("cpu").float() + + self.model.kai_model = self + utils.koboldai_vars.modeldim = self.get_hidden_size() + + def _get_model(self, location: str, tf_kwargs: Dict): + path_4bit, groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth) + print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") + + print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit") + if utils.koboldai_vars.model_type == "gptj": + if self.offload_4bit: + model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = gptj_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + elif utils.koboldai_vars.model_type == "gpt_neox": + if self.offload_4bit: + model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = gptneox_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + elif utils.koboldai_vars.model_type == "llama": + if self.offload_4bit: + model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = llama_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + elif utils.koboldai_vars.model_type == "opt": + if self.offload_4bit: + model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = opt_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + else: + raise RuntimeError(f"4-bit load failed. 
Model type {utils.koboldai_vars.model_type} not supported in 4-bit") + + return model.half() + + def _get_tokenizer(self, location: str): + if utils.koboldai_vars.model_type == "llama": + tokenizer = LlamaTokenizer.from_pretrained(utils.koboldai_vars.custmodpth) + else: + tokenizer = AutoTokenizer.from_pretrained(utils.koboldai_vars.custmodpth) + + return tokenizer From ded5542d3a78be4d9c0e79486cd387f285acce42 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 16 Apr 2023 21:11:35 +0200 Subject: [PATCH 046/113] Fix error in 4bit offloading initialization code when running with --nobreakmodel --- aiserver.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index a7583d2c..913bea5c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3171,7 +3171,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal lowmem = {} koboldai_vars.lazy_load = False # Also, lazy loader doesn't support GPT-2 models - gpu_layers_list = [int(l) for l in gpu_layers.split(",")] + try: + gpu_layers_list = [int(l) for l in gpu_layers.split(",")] + except ValueError: + gpu_layers_list = [utils.num_layers(model_config)] offload_4bit = use_4_bit and sum(gpu_layers_list) < utils.num_layers(model_config) if offload_4bit: From 1ef515f4c22fc48241f0b825bb47004df17990f9 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 17 Apr 2023 07:21:18 +0200 Subject: [PATCH 047/113] Fix lazy-loading on 4-bit --- modeling/inference_models/hf_torch.py | 19 +++++++++++-------- modeling/inference_models/hf_torch_4bit.py | 2 -- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index a2b2ff80..53b02e6d 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -412,14 +412,17 @@ class HFTorchInferenceModel(HFInferenceModel): @functools.lru_cache(maxsize=None) def get_original_key(key): - return max( - ( - original_key - for original_key in utils.module_names - if original_key.endswith(key) - ), - key=len, - ) + try: + return max( + ( + original_key + for original_key in utils.module_names + if original_key.endswith(key) + ), + key=len, + ) + except ValueError: + return key for key, value in model_dict.items(): original_key = get_original_key(key) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 21f4ebfe..4b02d642 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -104,8 +104,6 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): ) utils.koboldai_vars.model = self.model_name - self.lazy_load = False - self.init_model_config() gpulayers = utils.args.breakmodel_gpulayers From 12699aa22950fe33912c82bf11ac8bc8a3487299 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 17 Apr 2023 07:26:03 +0200 Subject: [PATCH 048/113] Show 4-bit toggle without experimental ui --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 0a98d16f..21290f37 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1602,7 +1602,7 @@ def get_model_info(model, directory=""): 'break_values': break_values, 'gpu_count': gpu_count, 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select, 'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False, - 'bit_4_available': koboldai_vars.bit_4_available if 
koboldai_vars.experimental_features else False, + 'bit_4_available': koboldai_vars.bit_4_available, 'show_custom_model_box': show_custom_model_box}) if send_horde_models: get_cluster_models({'key': key_value, 'url': default_url}) From 10c99a853c207c34d163914042b903d606dad8ee Mon Sep 17 00:00:00 2001 From: nerodiafasciata Date: Wed, 12 Apr 2023 21:37:44 -0500 Subject: [PATCH 049/113] Added AMD instructions, added formatting Added AMD install instructions Formatted the install/run section for improved readability --- README.md | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0657fa0b..170c4f42 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ ### Install/Use Guide (This guide is for both Linux and Windows and assumes user has git installed and a basic grasp of command line use) +#### Installation In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. `git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules` @@ -11,10 +12,28 @@ In the command prompt/command line navigate to where you want the KoboldAI subfo Next step, (Windows) subfolder mode or B: option doesn't matter choose either -[if on Windows] `install_requirements.bat` if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. +* [if on Windows] + ``` + install_requirements.bat + ``` + * if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. -[if on Linux] `install_requirements.sh` +* [if on Linux with Nvidia] + ``` + ./install_requirements.sh + ``` +* [if on Linux with AMD] + ``` + ./install_requirements.sh rocm + ./commandline-rocm.sh + cd repos/gptq + python setup_cuda.py install + ``` + * If you get error missing hip/hip_runtime_xxx.h you dont have proper rocm & hip pkg installed + * If you get CUDA_HOME envar is not set run in env: + `pip3 install torch --index-url https://download.pytorch.org/whl/rocm5.4.2 --force-reinstall` +#### Setting up models If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). @@ -23,9 +42,10 @@ Then move your model folder to KoboldAI/models, and rename the .pt or .safetenso So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). +#### Running KoboldAI and loading 4bit models If you haven't done so already, exit the command prompt/leave KAI's conda env. (Close the commandline window on Windows, run `exit` on Linux) -Run `play.bat` [windows] or `play.sh` [linux] +Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD] Switch to UI2, enable Experimental UI under the Interface tab, then load your model and be sure 4-bit toggle is on. 
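The model-folder layout described in the install guide above is exactly what the loader code in this patch series globs for (`4bit*.safetensors` / `4bit*.pt` next to the usual Hugging Face files). As a rough illustration only — not part of any patch — a folder can be sanity-checked before launch; the file list below is an assumed minimum, not an exhaustive requirement:

```python
# Hypothetical helper, not shipped in any of these patches: check that a model
# folder matches the 4-bit layout the guide above describes.
import glob
import os
import sys

# Assumed minimum set of Hugging Face files; real models may need more.
REQUIRED_FILES = ["config.json", "tokenizer.model"]

def check_4bit_folder(modelpath: str) -> bool:
    # Look for a quantized checkpoint named like 4bit.pt or 4bit-128g.safetensors.
    quantized = [
        f
        for pattern in ("4bit*.safetensors", "4bit*.pt")
        for f in glob.glob(os.path.join(modelpath, pattern))
    ]
    if not quantized:
        print("No 4bit*.pt or 4bit*.safetensors checkpoint found.")
        return False
    missing = [f for f in REQUIRED_FILES if not os.path.isfile(os.path.join(modelpath, f))]
    if missing:
        print("Missing Hugging Face files:", ", ".join(missing))
        return False
    print("Found quantized checkpoint:", quantized[0])
    return True

if __name__ == "__main__":
    folder = sys.argv[1] if len(sys.argv) > 1 else "."
    sys.exit(0 if check_4bit_folder(folder) else 1)
```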
From 934571857ba986202c895d93efe95a58fbcc6308 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 18 Apr 2023 22:52:54 +0200 Subject: [PATCH 050/113] Fix offloading --- modeling/inference_models/hf_torch_4bit.py | 37 ++++++---------------- repos/gptq | 2 +- 2 files changed, 11 insertions(+), 28 deletions(-) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 4b02d642..be504d4f 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -109,10 +109,10 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): gpulayers = utils.args.breakmodel_gpulayers try: - gpu_layers_list = [int(l) for l in gpulayers.split(",")] + self.gpu_layers_list = [int(l) for l in gpulayers.split(",")] except ValueError: - gpu_layers_list = [utils.num_layers(self.model_config)] - self.offload_4bit = sum(gpu_layers_list) < utils.num_layers(self.model_config) + self.gpu_layers_list = [utils.num_layers(self.model_config)] + self.offload_4bit = sum(self.gpu_layers_list) < utils.num_layers(self.model_config) if self.offload_4bit: utils.koboldai_vars.lazy_load = False @@ -321,25 +321,8 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): self.patch_embedding() - if utils.koboldai_vars.hascuda: - if utils.koboldai_vars.usegpu: - # Use just VRAM - self.model = self.model.half().to(utils.koboldai_vars.gpu_device) - elif utils.koboldai_vars.breakmodel: - # Use both RAM and VRAM (breakmodel) - if not self.lazy_load: - self.breakmodel_device_config(self.model.config) - self._move_to_devices() - elif breakmodel.disk_blocks > 0: - # Use disk - self._move_to_devices() - else: - # Use CPU - self.model = self.model.to("cpu").float() - elif breakmodel.disk_blocks > 0: - self._move_to_devices() - else: - self.model = self.model.to("cpu").float() + if not self.offload_4bit: + self.model = self.model.half().to(utils.koboldai_vars.gpu_device) self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() @@ -351,28 +334,28 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit") if utils.koboldai_vars.model_type == "gptj": if self.offload_4bit: - model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) else: model = gptj_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) elif utils.koboldai_vars.model_type == "gpt_neox": if self.offload_4bit: - model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) else: model = gptneox_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) elif utils.koboldai_vars.model_type == "llama": if self.offload_4bit: - model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) else: model = llama_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) elif utils.koboldai_vars.model_type == "opt": if self.offload_4bit: - model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, 
gpu_layers_list) + model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) else: model = opt_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) else: raise RuntimeError(f"4-bit load failed. Model type {utils.koboldai_vars.model_type} not supported in 4-bit") - return model.half() + return model.half() if not self.offload_4bit else model def _get_tokenizer(self, location: str): if utils.koboldai_vars.model_type == "llama": diff --git a/repos/gptq b/repos/gptq index 50b22e2b..5d94e5fb 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 50b22e2ba8ec0f5cf0dca719392a2ec5254e7228 +Subproject commit 5d94e5fb2a03a432d9cbb0db95493ac33b0bfd71 From 8d61d6b04ab7c100db4871fb33c7b7eec835ccc4 Mon Sep 17 00:00:00 2001 From: nerodiafasciata Date: Tue, 25 Apr 2023 00:25:28 -0500 Subject: [PATCH 051/113] install instruction update: don't run as admin (#12) * Update README.md Added note to tell windows users not to install as admin --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 170c4f42..67fe881a 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ #### Installation In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. +Note: do not run your command prompt as administrator/with elevated priviledges, reports suggest this leads to problems. + `git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules` `cd KoboldAI` From b58e5f353febf4c20f5ae2194b369f7e9160420a Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 25 Apr 2023 18:56:25 +0200 Subject: [PATCH 052/113] Add wheel links file for pip --- docs/gptq-whl-links.html | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 docs/gptq-whl-links.html diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html new file mode 100644 index 00000000..c612b5e1 --- /dev/null +++ b/docs/gptq-whl-links.html @@ -0,0 +1,2 @@ +quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl +quant_cuda-0.0.0-cp38-cp38-win_amd64.whl From cd289a947824ad52daa9192363115b5322dbf749 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 25 Apr 2023 19:06:25 +0200 Subject: [PATCH 053/113] Use custom pip repo for wheels instead of modifying install_requirements scripts --- environments/huggingface.yml | 2 ++ install_requirements.bat | 4 ---- install_requirements.sh | 3 --- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index daa25e1f..35580603 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -45,3 +45,5 @@ dependencies: - ftfy - pydub - diffusers + - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html + - quant_cuda diff --git a/install_requirements.bat b/install_requirements.bat index 3b735ddf..2a4534c1 100644 --- a/install_requirements.bat +++ b/install_requirements.bat @@ -48,8 +48,6 @@ umamba.exe create -r B:\python\ -n base umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy umamba.exe -r B:\ clean -a -y rd B:\Python\pkgs /S /Q -call B:\python\condabin\activate -pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" subst B: /d pause exit @@ -62,7 +60,5 @@ umamba.exe create -r miniconda3\ -n base umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingface.yml -y --always-copy umamba.exe clean -a -y rd 
miniconda3\Python\pkgs /S /Q -call miniconda3\condabin\activate -pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit diff --git a/install_requirements.sh b/install_requirements.sh index 7b5a8d5b..6f0e0dfd 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -5,9 +5,6 @@ wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar - bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y - -# Install quant_cuda module for 4-bit -bin/micromamba run -r runtime -n koboldai pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl exit fi if [[ $1 = "rocm" ]]; then From 99c4c3bae4956e7190beb6909a42d7debd033553 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 17 Apr 2023 07:26:03 +0200 Subject: [PATCH 054/113] Show 4-bit toggle without experimental ui --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 85523734..2fc8990c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1809,7 +1809,7 @@ def get_model_info(model, directory=""): 'break_values': break_values, 'gpu_count': gpu_count, 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select, 'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False, - 'bit_4_available': koboldai_vars.bit_4_available if koboldai_vars.experimental_features else False, + 'bit_4_available': koboldai_vars.bit_4_available, 'show_custom_model_box': show_custom_model_box}) if send_horde_models: get_cluster_models({'key': key_value, 'url': default_url}) From aedb6388c5f22b3bd99a0b8e17dc45d14c50e142 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 27 Apr 2023 07:05:11 +0200 Subject: [PATCH 055/113] Update README, remove experimental UI --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 67fe881a..aadfd345 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ If you haven't done so already, exit the command prompt/leave KAI's conda env. ( Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD] -Switch to UI2, enable Experimental UI under the Interface tab, then load your model and be sure 4-bit toggle is on. +Switch to UI2, then load your model and be sure 4-bit toggle is on. The 4bit toggle shows when a model to load is selected. 
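With the manual `pip install` lines removed from the install scripts, the prebuilt `quant_cuda` wheel is pulled in through the `--find-links` entry in `environments/huggingface.yml`. A quick way to confirm the wheel actually installed is sketched below; this is an illustration rather than part of the patches, and it assumes nothing about the extension beyond it being importable:

```python
# Illustrative post-install check (not part of the patch series): confirm the
# quant_cuda extension from the gptq-whl-links index is importable and that
# PyTorch can see a CUDA device.
import importlib

def check_quant_backend() -> None:
    try:
        quant_cuda = importlib.import_module("quant_cuda")
        print("quant_cuda loaded from", quant_cuda.__file__)
    except ImportError as err:
        print("quant_cuda wheel missing or broken:", err)
        return

    try:
        import torch
    except ImportError:
        print("PyTorch is not installed in this environment.")
        return
    print("CUDA available:", torch.cuda.is_available())

if __name__ == "__main__":
    check_quant_backend()
```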
From 18ac5dfce6398a561c4521356f7187e6977a7c61 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 27 Apr 2023 16:04:30 +0200 Subject: [PATCH 056/113] Update to Pytorch 1.13.1 and CUDA 11.7 --- docs/gptq-whl-links.html | 4 ++-- environments/huggingface.yml | 6 ++++-- repos/gptq | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index c612b5e1..710a43b8 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -1,2 +1,2 @@ -quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl -quant_cuda-0.0.0-cp38-cp38-win_amd64.whl +quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl +quant_cuda-0.0.0-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 35580603..b1b86c45 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -1,6 +1,7 @@ name: koboldai channels: - pytorch + - nvidia - conda-forge - defaults dependencies: @@ -8,9 +9,10 @@ dependencies: - flask-socketio=5.3.2 - flask-session=0.4.0 - python-socketio=5.7.2 - - pytorch=1.11.* + - pytorch=1.13.1 + - pytorch-cuda=11.7 - python=3.8.* - - cudatoolkit=11.1 + - cudatoolkit=11.7 - eventlet=0.33.3 - dnspython=2.2.1 - markdown diff --git a/repos/gptq b/repos/gptq index 50b22e2b..3c16fd9c 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 50b22e2ba8ec0f5cf0dca719392a2ec5254e7228 +Subproject commit 3c16fd9c7946ebe85df8d951cb742adbc1966ec7 From 81f92ec402e07a07516a39859c98a616ecb47084 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 28 Apr 2023 10:55:22 +0200 Subject: [PATCH 057/113] Fix missing 4bit setting --- koboldai_settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/koboldai_settings.py b/koboldai_settings.py index 1a4fcce6..7b7acac1 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1296,6 +1296,8 @@ class system_settings(settings): self.keep_img_gen_in_memory = False self.cookies = {} #cookies for colab since colab's URL changes, cookies are lost self.experimental_features = False + # Check if repos/gptq exists for 4-bit mode + self.bit_4_available = os.path.isdir("repos/gptq") self.seen_messages = [] self.git_repository = "" self.git_branch = "" From 852005fef484846c648cb6bdd9b0e2091e75e486 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 28 Apr 2023 18:32:34 +0200 Subject: [PATCH 058/113] Always use GPU offloader if splitting across GPUs, this increases speed considerably --- aiserver.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/aiserver.py b/aiserver.py index 5c2b7533..578a2cff 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3186,9 +3186,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal gpu_layers_list = [int(l) for l in gpu_layers.split(",")] except ValueError: gpu_layers_list = [utils.num_layers(model_config)] - offload_4bit = use_4_bit and sum(gpu_layers_list) < utils.num_layers(model_config) - if offload_4bit: + if use_4_bit: koboldai_vars.lazy_load = False print("4-bit CPU offloader active") @@ -3223,28 +3222,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") if koboldai_vars.model_type == "gptj": - if offload_4bit: - model = load_quant_offload(gptj_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) - else: - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(gptj_load_quant, 
koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - if offload_4bit: - model = load_quant_offload(gptneox_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) - else: - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(gptneox_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - if offload_4bit: - model = load_quant_offload(llama_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) - else: - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(llama_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": - if offload_4bit: - model = load_quant_offload(opt_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) - else: - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(opt_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") @@ -3352,7 +3339,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal patch_causallm(model) if(koboldai_vars.hascuda): - if offload_4bit: + if use_4_bit: koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate elif(koboldai_vars.usegpu): From 20a5587d660f651f108762ec99faf357a678285d Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 30 Apr 2023 18:17:43 +0200 Subject: [PATCH 059/113] Always use offloader script, because it speeds up multi gpu --- modeling/inference_models/hf_torch_4bit.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index be504d4f..98c9d785 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -333,25 +333,13 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit") if utils.koboldai_vars.model_type == "gptj": - if self.offload_4bit: - model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) - else: - model = gptj_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "gpt_neox": - if self.offload_4bit: - model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) - else: - model = gptneox_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "llama": - if self.offload_4bit: - model = load_quant_offload(llama_load_quant, 
utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) - else: - model = llama_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "opt": - if self.offload_4bit: - model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) - else: - model = opt_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) else: raise RuntimeError(f"4-bit load failed. Model type {utils.koboldai_vars.model_type} not supported in 4-bit") From aa67135d4280279fc50bc8223b582ec2fae38e11 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 30 Apr 2023 21:59:22 +0200 Subject: [PATCH 060/113] Implement new model format Remove 4bit toggle --- aiserver.py | 24 ++++++++-- koboldai_settings.py | 8 +++- modeling/inference_models/hf_torch_4bit.py | 55 +--------------------- static/koboldai.js | 21 +-------- 4 files changed, 31 insertions(+), 77 deletions(-) diff --git a/aiserver.py b/aiserver.py index ce5e3558..6b81eaf0 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1088,6 +1088,24 @@ def loadmodelsettings(): if(not koboldai_vars.gamestarted): koboldai_vars.authornotetemplate = koboldai_vars.setauthornotetemplate + gptq_legacy_files = glob.glob(os.path.join(koboldai_vars.custmodpth, "4bit*.pt")) + glob.glob(os.path.join(koboldai_vars.custmodpth, "4bit*.safetensors")) + if "gptq_bits" in js: + koboldai_vars.gptq_model = True + koboldai_vars.gptq_bits = js["gptq_bits"] + koboldai_vars.gptq_groupsize = js.get("gptq_groupsize", -1) + safetensors_file = os.path.join(koboldai_vars.custmodpth, "model.safetensors") + pt_file = os.path.join(koboldai_vars.custmodpth, "model.ckpt") + koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file + elif gptq_legacy_files: + koboldai_vars.gptq_model = True + koboldai_vars.gptq_bits = 4 + koboldai_vars.gptq_file = gptq_legacy_files[0] + fname = Path(koboldai_vars.gptq_file).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + koboldai_vars.gptq_groupsize = int(g[0]) if g else -1 + else: + koboldai_vars.gptq_model = False + #==================================================================# # Take settings from koboldai_vars and write them to client settings file #==================================================================# @@ -1777,7 +1795,7 @@ def unload_model(): koboldai_vars.badwordsids = koboldai_settings.badwordsids_default -def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False): +def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False): global model global tokenizer global model_config @@ -1957,7 +1975,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except: pass - if use_4_bit: + if koboldai_vars.gptq_model: from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel model = HFTorch4BitInferenceModel( koboldai_vars.model, @@ -6495,7 +6513,7 @@ def UI_2_load_model(data): koboldai_vars.model = 
data['model'] koboldai_vars.custmodpth = data['path'] print("loading Model") - load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'], use_4_bit=data['use_4_bit']) + load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit']) #==================================================================# # Event triggered when load story is clicked diff --git a/koboldai_settings.py b/koboldai_settings.py index 56697573..c6560e32 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -919,7 +919,13 @@ class story_settings(settings): # In percent!!! self.commentary_chance = 0 self.commentary_enabled = False - + + # 4bit model vals + self.gptq_model = False + self.gptq_bits = -1 + self.gptq_groupsize = -1 + self.gptq_file = None + self.save_paths = SavePaths(os.path.join("stories", self.story_name or "Untitled")) ################### must be at bottom ######################### diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 98c9d785..a0e89436 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -27,64 +27,12 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel # 4-bit dependencies from pathlib import Path -import glob sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant from opt import load_quant as opt_load_quant from offload import load_quant_offload -monkey_patched_4bit = False - - -def prepare_4bit_load(modelpath): - paths_4bit = ["4bit*.safetensors", "4bit*.pt"] - paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] - result = False - groupsize = -1 - for p in paths_4bit: - p = os.path.join(modelpath, p) - val = [v for v in glob.glob(p) if "4bit-old" not in v] - if val: - result = val[0] - fname = Path(result).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname) - if g: - groupsize = int(g[0]) - break - - global monkey_patched_4bit - - # Monkey-patch in old-format pt-file support - if not result: - print("4-bit file not found, falling back to old format.") - for p in paths_4bit_old: - p = os.path.join(modelpath, p) - if os.path.isfile(p): - result = p - break - - if not result: - print("4-bit old-format file not found, loading failed.") - raise RuntimeError("4-bit load failed. 
PT/Safetensors-File not found.") - - import llama, opt, gptneox, gptj, old_quant - llama.make_quant = old_quant.old_make_quant - opt.make_quant = old_quant.old_make_quant - gptneox.make_quant = old_quant.old_make_quant - gptj.make_quant = old_quant.old_make_quant - monkey_patched_4bit = True - elif monkey_patched_4bit: - # Undo monkey patch - print("Undoing 4-bit old format monkey patch") - import llama, opt, gptneox, gptj, quant - llama.make_quant = quant.make_quant - opt.make_quant = quant.make_quant - gptneox.make_quant = quant.make_quant - gptj.make_quant = quant.make_quant - monkey_patched_4bit = False - - return result, groupsize class HFTorch4BitInferenceModel(HFTorchInferenceModel): @@ -328,7 +276,8 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): utils.koboldai_vars.modeldim = self.get_hidden_size() def _get_model(self, location: str, tf_kwargs: Dict): - path_4bit, groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth) + path_4bit = utils.koboldai_vars.gptq_file + groupsize = utils.koboldai_vars.gptq_groupsize print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit") diff --git a/static/koboldai.js b/static/koboldai.js index 89ee2ea1..cc31899f 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -1472,7 +1472,6 @@ function show_model_menu(data) { document.getElementById("modelurl").classList.add("hidden"); document.getElementById("use_gpu_div").classList.add("hidden"); document.getElementById("use_8_bit_div").classList.add("hidden"); - document.getElementById("use_4_bit_div").classList.add("hidden"); document.getElementById("modellayers").classList.add("hidden"); document.getElementById("oaimodel").classList.add("hidden"); var model_layer_bars = document.getElementById('model_layer_bars'); @@ -1646,14 +1645,6 @@ function selected_model_info(data) { document.getElementById("use_8_bit_div").classList.add("hidden"); document.getElementById("use_8_bit").checked = false; } - - //hide or unhide 4 bit mode - if (data.bit_4_available) { - document.getElementById("use_4_bit_div").classList.remove("hidden"); - } else { - document.getElementById("use_4_bit_div").classList.add("hidden"); - document.getElementById("use_4_bit").checked = false; - } //default URL loading if (data.default_url != null) { @@ -1823,8 +1814,6 @@ function selected_model_info(data) { accept.classList.remove("disabled"); } accept.disabled = false; - - set_4_bit_mode(invert=false); } function update_gpu_layers() { @@ -1885,8 +1874,7 @@ function load_model() { 'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(), 'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value, 'online_model': selected_models, - 'use_8_bit': document.getElementById('use_8_bit').checked, - 'use_4_bit': document.getElementById('use_4_bit').checked}; + 'use_8_bit': document.getElementById('use_8_bit').checked}; socket.emit("load_model", message); closePopups(); } @@ -3170,13 +3158,6 @@ function save_preset() { closePopups(); } -function set_4_bit_mode(invert=true) { - bit_4_status = document.getElementById("use_4_bit").checked; - if (invert) { - bit_4_status = !bit_4_status; - } -} - //--------------------------------------------General UI Functions------------------------------------ From 9c3d578d6c3449f951e97be06b67bc7b84eff0ba Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 2 May 2023 21:32:20 +0200 Subject: [PATCH 061/113] Work on model download support --- aiserver.py | 32 +++---- 
modeling/inference_models/generic_hf_torch.py | 5 +- modeling/inference_models/hf.py | 8 ++ modeling/inference_models/hf_torch_4bit.py | 86 ++++++++++++++++--- 4 files changed, 98 insertions(+), 33 deletions(-) diff --git a/aiserver.py b/aiserver.py index 48e70854..81bb900f 100644 --- a/aiserver.py +++ b/aiserver.py @@ -50,6 +50,8 @@ import multiprocessing import numpy as np from collections import OrderedDict from typing import Any, Callable, TypeVar, Tuple, Union, Dict, Set, List, Optional, Type +import glob +from pathlib import Path import requests import html @@ -86,18 +88,6 @@ allowed_ips = set() # empty set enable_whitelist = False -# 4-bit dependencies -from pathlib import Path -import glob -sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) -from gptj import load_quant as gptj_load_quant -from gptneox import load_quant as gptneox_load_quant -from llama import load_quant as llama_load_quant -from opt import load_quant as opt_load_quant -from offload import load_quant_offload -monkey_patched_4bit = False - - if lupa.LUA_VERSION[:2] != (5, 4): logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.") @@ -1974,6 +1964,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except: pass + if not koboldai_vars.gptq_model: + # Run generic HF model load_config first to check what model it is + from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel + model = GenericHFTorchInferenceModel( + koboldai_vars.model, + lazy_load=koboldai_vars.lazy_load, + low_mem=args.lowmem + ) + model.load_config() + if koboldai_vars.gptq_model: from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel model = HFTorch4BitInferenceModel( @@ -1981,14 +1981,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal lazy_load=koboldai_vars.lazy_load, low_mem=args.lowmem ) - else: - from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel - model = GenericHFTorchInferenceModel( - koboldai_vars.model, - lazy_load=koboldai_vars.lazy_load, - low_mem=args.lowmem - ) - model.load( save_model=not (args.colab or args.cacheonly) or args.savemodel, initial_load=initial_load, diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py index ce91b176..d45513aa 100644 --- a/modeling/inference_models/generic_hf_torch.py +++ b/modeling/inference_models/generic_hf_torch.py @@ -24,7 +24,7 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel class GenericHFTorchInferenceModel(HFTorchInferenceModel): - def _load(self, save_model: bool, initial_load: bool) -> None: + def load_config(self) -> None: utils.koboldai_vars.allowsp = True # Make model path the same as the model name to make this consistent @@ -50,6 +50,9 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): self.init_model_config() + def _load(self, save_model: bool, initial_load: bool) -> None: + self.load_config() + tf_kwargs = { "low_cpu_mem_usage": True, } diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index eae4bb2d..480da5d3 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -61,6 +61,14 @@ class HFInferenceModel(InferenceModel): cache_dir="cache", ) utils.koboldai_vars.model_type = self.model_config.model_type + + if "gptq_bits" in dir(self.model_config): + utils.koboldai_vars.gptq_model = True + utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits + 
utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize + utils.koboldai_vars.gptq_file = None + else: + utils.koboldai_vars.gptq_model = False except ValueError: utils.koboldai_vars.model_type = { "NeoCustom": "gpt_neo", diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index a0e89436..f0ff87b9 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import glob import json import torch import re @@ -9,7 +10,6 @@ import sys from typing import Union from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer -from modeling.inference_model import SuperLegacyModelError import utils import modeling.lazy_loader as lazy_loader @@ -33,6 +33,66 @@ from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant from opt import load_quant as opt_load_quant from offload import load_quant_offload +monkey_patched_4bit = False + + +def prepare_4bit_load(modelpath): + path_4bit = os.path.join(modelpath, "model.safetensors") + if os.path.isfile(path_4bit): + return path_4bit, False + + path_4bit = os.path.join(modelpath, "model.ckpt") + if os.path.isfile(path_4bit): + return path_4bit, False + + # Legacy format support + paths_4bit = ["4bit*.safetensors", "4bit*.pt"] + paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] + result = False + groupsize = -1 + for p in paths_4bit: + p = os.path.join(modelpath, p) + val = [v for v in glob.glob(p) if "4bit-old" not in v] + if val: + result = val[0] + fname = Path(result).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + if g: + groupsize = int(g[0]) + break + + global monkey_patched_4bit + + # Monkey-patch in old-format pt-file support + if not result: + print("4-bit file not found, falling back to old format.") + for p in paths_4bit_old: + p = os.path.join(modelpath, p) + if os.path.isfile(p): + result = p + break + + if not result: + print("4-bit old-format file not found, loading failed.") + raise RuntimeError("4-bit load failed. 
PT/Safetensors-File not found.") + + import llama, opt, gptneox, gptj, old_quant + llama.make_quant = old_quant.old_make_quant + opt.make_quant = old_quant.old_make_quant + gptneox.make_quant = old_quant.old_make_quant + gptj.make_quant = old_quant.old_make_quant + monkey_patched_4bit = True + elif monkey_patched_4bit: + # Undo monkey patch + print("Undoing 4-bit old format monkey patch") + import llama, opt, gptneox, gptj, quant + llama.make_quant = quant.make_quant + opt.make_quant = quant.make_quant + gptneox.make_quant = quant.make_quant + gptj.make_quant = quant.make_quant + monkey_patched_4bit = False + + return result, groupsize class HFTorch4BitInferenceModel(HFTorchInferenceModel): @@ -87,17 +147,12 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): ): try: metamodel = AutoModelForCausalLM.from_config(self.model_config) + utils.layers_module_names = utils.get_layers_module_names(metamodel) + utils.module_names = list(metamodel.state_dict().keys()) + utils.named_buffers = list(metamodel.named_buffers(recurse=True)) except Exception as e: - logger.error(f"Fell back to neo for metamodel due to {e}") - try: - metamodel = GPTNeoForCausalLM.from_config(self.model_config) - except Exception as e: - logger.error(f"Falling back again due to {e}") - raise SuperLegacyModelError - - utils.layers_module_names = utils.get_layers_module_names(metamodel) - utils.module_names = list(metamodel.state_dict().keys()) - utils.named_buffers = list(metamodel.named_buffers(recurse=True)) + logger.warning(f"Gave up on lazy loading due to {e}") + self.lazy_load = False # Download model from Huggingface if it does not exist, otherwise load locally with self._maybe_use_float16(), lazy_loader.use_lazy_load( @@ -276,8 +331,15 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): utils.koboldai_vars.modeldim = self.get_hidden_size() def _get_model(self, location: str, tf_kwargs: Dict): - path_4bit = utils.koboldai_vars.gptq_file + if not utils.koboldai_vars.custmodpth: + pass groupsize = utils.koboldai_vars.gptq_groupsize + + path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth) + + if legacy_groupsize is not False: + groupsize = legacy_groupsize + print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit") From dd6644aaf06813ceada9c0d7f669f1dfbcb38a09 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 2 May 2023 22:11:28 +0200 Subject: [PATCH 062/113] Pytorch 2.0 (#18) * Update huggingface.yml to Pytorch 2.0 and CUDA 11.8 * Update github docs pip wheel hub Update ROCm requirements * Add rocm wheel --- docs/gptq-whl-links.html | 5 +++-- environments/huggingface.yml | 7 +++---- environments/rocm.yml | 8 +++++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 710a43b8..fed8b397 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -1,2 +1,3 @@ -quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl -quant_cuda-0.0.0-cp38-cp38-win_amd64.whl +quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl +quant_rocm-0.0.0-cp38-cp38-linux_x86_64.whl +quant_cuda-0.0.0-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 8d5907ab..e5fb939c 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -10,10 +10,9 @@ dependencies: - flask-socketio=5.3.2 - flask-session=0.4.0 - python-socketio=5.7.2 - - pytorch=1.13.1 - - pytorch-cuda=11.7 + - pytorch=2.0.0 + - pytorch-cuda=11.8 - 
python=3.8.* - - cudatoolkit=11.7 - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -34,7 +33,7 @@ dependencies: - flask-cors - lupa==1.10 - transformers==4.28.0 - - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc + - datasets - huggingface_hub==0.12.1 - safetensors - accelerate==0.18.0 diff --git a/environments/rocm.yml b/environments/rocm.yml index 91b63dbd..9358575d 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -24,14 +24,14 @@ dependencies: - Pillow - psutil - pip: - - --extra-index-url https://download.pytorch.org/whl/rocm5.2 - - torch==1.13.1+rocm5.2 + - --extra-index-url https://download.pytorch.org/whl/rocm5.4.2 + - torch==2.0.0+rocm5.4.2 - flask-cloudflared==0.0.10 - flask-ngrok - flask-cors - lupa==1.10 - transformers==4.28.0 - - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc + - datasets - huggingface_hub==0.12.1 - safetensors - accelerate==0.18.0 @@ -42,3 +42,5 @@ dependencies: - ftfy - pydub - diffusers + - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html + - quant_rocm From d48fedcbfb7ba8dca9623215822ab1cbb700612e Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 4 May 2023 18:31:37 +0200 Subject: [PATCH 063/113] Fix llama 4-bit loading error --- modeling/inference_models/hf_torch_4bit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index f0ff87b9..10ef0e56 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -14,7 +14,7 @@ from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, import utils import modeling.lazy_loader as lazy_loader import koboldai_settings -from logger import logger, set_logger_verbosity, quiesce_logger +from logger import logger, set_logger_verbosity try: import breakmodel @@ -24,6 +24,7 @@ except ModuleNotFoundError as e: raise e from modeling.inference_models.hf_torch import HFTorchInferenceModel +from modeling.tokenizer import GenericTokenizer # 4-bit dependencies from pathlib import Path @@ -362,4 +363,4 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): else: tokenizer = AutoTokenizer.from_pretrained(utils.koboldai_vars.custmodpth) - return tokenizer + return GenericTokenizer(tokenizer) From 4180620999307a8eefb2bcd05e94161eb478243b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 4 May 2023 19:52:56 +0200 Subject: [PATCH 064/113] Remove unnecessary changes, move gptq detection function to 4bit.py --- aiserver.py | 32 +--------------------- modeling/inference_models/hf_torch_4bit.py | 32 ++++++++++++++++++++++ static/koboldai.js | 8 +++--- templates/popups.html | 4 --- 4 files changed, 37 insertions(+), 39 deletions(-) diff --git a/aiserver.py b/aiserver.py index e4c0c521..80518450 100644 --- a/aiserver.py +++ b/aiserver.py @@ -601,6 +601,7 @@ utils.socketio = socketio # Weird import position to steal koboldai_vars from utils from modeling.patches import patch_transformers +from modeling.inference_models.hf_torch_4bit import load_model_gptq_settings old_socketio_on = socketio.on @@ -1078,37 +1079,6 @@ def loadmodelsettings(): koboldai_vars.authornotetemplate = koboldai_vars.setauthornotetemplate -def load_model_gptq_settings(): - try: - js = json.loads(str(model.model_config).partition(' ')[2]) - except Exception as e: - try: - try: - js = json.load(open(koboldai_vars.custmodpth + "/config.json", "r")) - except Exception as e: - js = json.load(open(koboldai_vars.custmodpth.replace('/', 
'_') + "/config.json", "r")) - except Exception as e: - js = {} - - gptq_legacy_files = glob.glob(os.path.join(koboldai_vars.custmodpth, "4bit*.pt")) + glob.glob(os.path.join(koboldai_vars.custmodpth, "4bit*.safetensors")) - if "gptq_bits" in js: - koboldai_vars.gptq_model = True - koboldai_vars.gptq_bits = js["gptq_bits"] - koboldai_vars.gptq_groupsize = js.get("gptq_groupsize", -1) - safetensors_file = os.path.join(koboldai_vars.custmodpth, "model.safetensors") - pt_file = os.path.join(koboldai_vars.custmodpth, "model.ckpt") - koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file - elif gptq_legacy_files: - koboldai_vars.gptq_model = True - koboldai_vars.gptq_bits = 4 - koboldai_vars.gptq_file = gptq_legacy_files[0] - fname = Path(koboldai_vars.gptq_file).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) - koboldai_vars.gptq_groupsize = int(g[0]) if g else -1 - else: - koboldai_vars.gptq_model = False - - #==================================================================# # Take settings from koboldai_vars and write them to client settings file #==================================================================# diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 10ef0e56..5eb8d60c 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -96,6 +96,38 @@ def prepare_4bit_load(modelpath): return result, groupsize +def load_model_gptq_settings(): + try: + js = json.loads(str(model.model_config).partition(' ')[2]) + except Exception as e: + try: + try: + js = json.load(open(utils.koboldai_vars.custmodpth + "/config.json", "r")) + except Exception as e: + js = json.load(open(utils.koboldai_vars.custmodpth.replace('/', '_') + "/config.json", "r")) + except Exception as e: + utils.koboldai_vars.gptq_model = False + return + + gptq_legacy_files = glob.glob(os.path.join(utils.koboldai_vars.custmodpth, "4bit*.pt")) + glob.glob(os.path.join(utils.koboldai_vars.custmodpth, "4bit*.safetensors")) + if "gptq_bits" in js: + utils.koboldai_vars.gptq_model = True + utils.koboldai_vars.gptq_bits = js["gptq_bits"] + utils.koboldai_vars.gptq_groupsize = js.get("gptq_groupsize", -1) + safetensors_file = os.path.join(utils.koboldai_vars.custmodpth, "model.safetensors") + pt_file = os.path.join(utils.koboldai_vars.custmodpth, "model.ckpt") + utils.koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file + elif gptq_legacy_files: + utils.koboldai_vars.gptq_model = True + utils.koboldai_vars.gptq_bits = 4 + utils.koboldai_vars.gptq_file = gptq_legacy_files[0] + fname = Path(utils.koboldai_vars.gptq_file).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + utils.koboldai_vars.gptq_groupsize = int(g[0]) if g else -1 + else: + utils.koboldai_vars.gptq_model = False + + class HFTorch4BitInferenceModel(HFTorchInferenceModel): def _load(self, save_model: bool, initial_load: bool) -> None: utils.koboldai_vars.allowsp = True diff --git a/static/koboldai.js b/static/koboldai.js index 7918c3ff..cfc32d21 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -1680,7 +1680,7 @@ function selected_model_info(data) { document.getElementById("use_8_bit_div").classList.add("hidden"); document.getElementById("use_8_bit").checked = false; } - + //default URL loading if (data.default_url != null) { document.getElementById("modelurl").value = data.default_url; @@ -1849,6 +1849,8 @@ function selected_model_info(data) { 
accept.classList.remove("disabled"); } accept.disabled = false; + + } function update_gpu_layers() { @@ -3231,8 +3233,6 @@ function save_preset() { closePopups(); } - - //--------------------------------------------General UI Functions------------------------------------ function put_cursor_at_element(element) { var range = document.createRange(); @@ -7388,4 +7388,4 @@ $el("#gamescreen").addEventListener("paste", function(event) { false, event.clipboardData.getData("text/plain") ); -}); +}); \ No newline at end of file diff --git a/templates/popups.html b/templates/popups.html index e53b6276..d3310e66 100644 --- a/templates/popups.html +++ b/templates/popups.html @@ -75,10 +75,6 @@
Use 8 bit mode
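(Illustrative aside, not part of the patch above.) Patch 064 moves the GPTQ settings detection out of aiserver.py; its legacy branch infers the group size from file names such as `4bit-128g.safetensors` using the regex it carries over. A minimal, self-contained sketch of that parsing — the file names below are made up for illustration only:

```
import re
from pathlib import Path

# Same pattern the relocated load_model_gptq_settings() applies to legacy files.
GROUPSIZE_RE = r"^(?:4bit)(?:-)(\d+)(?:g-?)"

def legacy_groupsize(fpath: str) -> int:
    """Return the group size encoded in a legacy 4-bit filename, or -1."""
    fname = Path(fpath).parts[-1]
    g = re.findall(GROUPSIZE_RE, fname)
    return int(g[0]) if g else -1

print(legacy_groupsize("models/example-llama/4bit-128g.safetensors"))  # -> 128
print(legacy_groupsize("models/example-llama/4bit.pt"))                # -> -1
```

A name without the `-<N>g` infix falls back to -1, matching the default `gptq_groupsize` used elsewhere in these patches.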
- From 43b0afc7a85d2ae61d478cd258a8015d177660b2 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 5 May 2023 20:07:10 +0200 Subject: [PATCH 065/113] Add safe MPT support --- .gitmodules | 4 ++++ aiserver.py | 2 +- environments/huggingface.yml | 1 + environments/rocm.yml | 1 + modeling/inference_models/generic_hf_torch.py | 3 ++- modeling/inference_models/hf.py | 2 +- modeling/inference_models/hf_torch.py | 2 +- modeling/inference_models/hf_torch_4bit.py | 3 ++- repos/__init__.py | 1 + repos/hf_bleeding_edge | 1 + utils.py | 2 +- 11 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 repos/__init__.py create mode 160000 repos/hf_bleeding_edge diff --git a/.gitmodules b/.gitmodules index c6f4b308..4a1fb7c9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -8,3 +8,7 @@ path = repos/gptq url = https://github.com/0cc4m/GPTQ-for-LLaMa branch = a8303654c200c25577130466e5f9bc1e70fc8a50 +[submodule "repos/hf_bleeding_edge"] + path = repos/hf_bleeding_edge + url = https://github.com/0cc4m/hf_bleeding_edge + branch = b5d0b80c6947605b9ccf080fc17b68a516ea5857 diff --git a/aiserver.py b/aiserver.py index 80518450..bb6cc171 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1621,7 +1621,7 @@ def get_layer_count(model, directory=""): else: if(directory): model = directory - from transformers import AutoConfig + from repos.hf_bleeding_edge import AutoConfig if(os.path.isdir(model.replace('/', '_'))): model_config = AutoConfig.from_pretrained(model.replace('/', '_'), revision=koboldai_vars.revision, cache_dir="cache") elif(is_model_downloaded(model)): diff --git a/environments/huggingface.yml b/environments/huggingface.yml index e5fb939c..a179c468 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -49,3 +49,4 @@ dependencies: - diffusers - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - quant_cuda + - einops diff --git a/environments/rocm.yml b/environments/rocm.yml index 9358575d..d0daf4f2 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -44,3 +44,4 @@ dependencies: - diffusers - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - quant_rocm + - einops diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py index 9e30a7fd..61004db5 100644 --- a/modeling/inference_models/generic_hf_torch.py +++ b/modeling/inference_models/generic_hf_torch.py @@ -6,7 +6,8 @@ import torch import shutil from typing import Union -from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, GPT2LMHeadModel +from transformers import GPTNeoForCausalLM, GPT2LMHeadModel +from repos.hf_bleeding_edge import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 99e55be4..8c797940 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -1,6 +1,6 @@ import os from typing import Optional -from transformers import AutoConfig +from repos.hf_bleeding_edge import AutoConfig import utils import koboldai_settings diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 3cc28291..e0081c90 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -19,9 +19,9 @@ from transformers import ( StoppingCriteria, GPTNeoForCausalLM, GPT2LMHeadModel, - AutoModelForCausalLM, LogitsProcessorList, ) +from repos.hf_bleeding_edge import AutoModelForCausalLM import utils import modeling.lazy_loader as 
lazy_loader diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 5eb8d60c..75fb9ddf 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -9,7 +9,8 @@ import shutil import sys from typing import Union -from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer +from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer +from repos.hf_bleeding_edge import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader diff --git a/repos/__init__.py b/repos/__init__.py new file mode 100644 index 00000000..af438273 --- /dev/null +++ b/repos/__init__.py @@ -0,0 +1 @@ +from . import hf_bleeding_edge diff --git a/repos/hf_bleeding_edge b/repos/hf_bleeding_edge new file mode 160000 index 00000000..b5d0b80c --- /dev/null +++ b/repos/hf_bleeding_edge @@ -0,0 +1 @@ +Subproject commit b5d0b80c6947605b9ccf080fc17b68a516ea5857 diff --git a/utils.py b/utils.py index 13ebb6a3..89b9fb4f 100644 --- a/utils.py +++ b/utils.py @@ -184,7 +184,7 @@ def decodenewlines(txt): # Returns number of layers given an HF model config #==================================================================# def num_layers(config): - return config["n_layer"] if isinstance(config, dict) else config.num_layers if hasattr(config, "num_layers") else config.n_layer if hasattr(config, "n_layer") else config.num_hidden_layers if hasattr(config, 'num_hidden_layers') else None + return config["n_layer"] if isinstance(config, dict) else config.num_layers if hasattr(config, "num_layers") else config.n_layer if hasattr(config, "n_layer") else config.num_hidden_layers if hasattr(config, 'num_hidden_layers') else config.n_layers if hasattr(config, "n_layers") else None #==================================================================# # Downloads huggingface checkpoints using aria2c if possible From dedf2afeb3df922f164892ff3144d6d110f0dc43 Mon Sep 17 00:00:00 2001 From: Henk Date: Fri, 5 May 2023 19:50:56 +0200 Subject: [PATCH 066/113] More max_context_length flexibility --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index bb6cc171..791ae071 100644 --- a/aiserver.py +++ b/aiserver.py @@ -8302,7 +8302,7 @@ class GenerationInputSchema(SamplerSettingsSchema): use_userscripts: bool = fields.Boolean(load_default=False, metadata={"description": "Whether or not to use the userscripts from the KoboldAI GUI when generating text."}) soft_prompt: Optional[str] = fields.String(metadata={"description": "Soft prompt to use when generating. 
If set to the empty string or any other string containing no non-whitespace characters, uses no soft prompt."}, validate=[soft_prompt_validator, validate.Regexp(r"^[^/\\]*$")]) max_length: int = fields.Integer(validate=validate.Range(min=1, max=512), metadata={"description": "Number of tokens to generate."}) - max_context_length: int = fields.Integer(validate=validate.Range(min=512, max=2048), metadata={"description": "Maximum number of tokens to send to the model."}) + max_context_length: int = fields.Integer(validate=validate.Range(min=1), metadata={"description": "Maximum number of tokens to send to the model."}) n: int = fields.Integer(validate=validate.Range(min=1, max=5), metadata={"description": "Number of outputs to generate."}) disable_output_formatting: bool = fields.Boolean(load_default=True, metadata={"description": "When enabled, all output formatting options default to `false` instead of the value in the KoboldAI GUI."}) frmttriminc: Optional[bool] = fields.Boolean(metadata={"description": "Output formatting option. When enabled, removes some characters from the end of the output such that the output doesn't end in the middle of a sentence. If the output is less than one sentence long, does nothing.\n\nIf `disable_output_formatting` is `true`, this defaults to `false` instead of the value in the KoboldAI GUI."}) From 2f7856f0d1e1d153256f884248fd37432ed57279 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 6 May 2023 20:52:42 +0200 Subject: [PATCH 067/113] Use GPTQ python module, add MPT quantized support --- aiserver.py | 2 +- docs/gptq-whl-links.html | 6 +-- environments/huggingface.yml | 3 +- environments/rocm.yml | 3 +- install_requirements.sh | 4 +- koboldai_settings.py | 7 +++- modeling/inference_models/generic_hf_torch.py | 2 +- modeling/inference_models/hf.py | 2 +- modeling/inference_models/hf_torch.py | 2 +- modeling/inference_models/hf_torch_4bit.py | 38 ++++++------------- repos/__init__.py | 1 - repos/gptq | 1 - repos/hf_bleeding_edge | 1 - 13 files changed, 30 insertions(+), 42 deletions(-) delete mode 100644 repos/__init__.py delete mode 160000 repos/gptq delete mode 160000 repos/hf_bleeding_edge diff --git a/aiserver.py b/aiserver.py index 791ae071..11258fc1 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1621,7 +1621,7 @@ def get_layer_count(model, directory=""): else: if(directory): model = directory - from repos.hf_bleeding_edge import AutoConfig + from hf_bleeding_edge import AutoConfig if(os.path.isdir(model.replace('/', '_'))): model_config = AutoConfig.from_pretrained(model.replace('/', '_'), revision=koboldai_vars.revision, cache_dir="cache") elif(is_model_downloaded(model)): diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index fed8b397..427185db 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -1,3 +1,3 @@ -quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl -quant_rocm-0.0.0-cp38-cp38-linux_x86_64.whl -quant_cuda-0.0.0-cp38-cp38-win_amd64.whl +gptq-0.0.1-cp38-cp38-linux_x86_64.whl +gptq_rocm-0.0.1-cp38-cp38-linux_x86_64.whl +gptq-0.0.1-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index a179c468..f7fad2de 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -47,6 +47,7 @@ dependencies: - ftfy - pydub - diffusers + - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - quant_cuda + - gptq - einops diff --git a/environments/rocm.yml b/environments/rocm.yml index d0daf4f2..2b979d4c 
100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -42,6 +42,7 @@ dependencies: - ftfy - pydub - diffusers + - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - quant_rocm + - gptq_rocm - einops diff --git a/install_requirements.sh b/install_requirements.sh index 6e37c7e9..561b1b00 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -5,14 +5,14 @@ if [[ $1 = "cuda" || $1 = "CUDA" ]]; then wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster -bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y +# bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y exit fi if [[ $1 = "rocm" || $1 = "ROCM" ]]; then wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster -bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y +# bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y exit fi echo Please specify either CUDA or ROCM diff --git a/koboldai_settings.py b/koboldai_settings.py index d278dcc4..3e0fc48a 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1303,7 +1303,12 @@ class system_settings(settings): self.cookies = {} #cookies for colab since colab's URL changes, cookies are lost self.experimental_features = False # Check if repos/gptq exists for 4-bit mode - self.bit_4_available = os.path.isdir("repos/gptq") + self.bit_4_available = True + try: + import gptq + except ImportError: + self.bit_4_available = False + self.seen_messages = [] self.git_repository = "" self.git_branch = "" diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py index 61004db5..78a4bf9f 100644 --- a/modeling/inference_models/generic_hf_torch.py +++ b/modeling/inference_models/generic_hf_torch.py @@ -7,7 +7,7 @@ import shutil from typing import Union from transformers import GPTNeoForCausalLM, GPT2LMHeadModel -from repos.hf_bleeding_edge import AutoModelForCausalLM +from hf_bleeding_edge import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 8c797940..5ee2abaa 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -1,6 +1,6 @@ import os from typing import Optional -from repos.hf_bleeding_edge import AutoConfig +from hf_bleeding_edge import AutoConfig import utils import koboldai_settings diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index e0081c90..3339a75d 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -21,7 +21,7 @@ from transformers import ( GPT2LMHeadModel, LogitsProcessorList, ) -from repos.hf_bleeding_edge import AutoModelForCausalLM +from hf_bleeding_edge import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py 
index 75fb9ddf..959d6258 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -10,7 +10,7 @@ import sys from typing import Union from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer -from repos.hf_bleeding_edge import AutoModelForCausalLM +from hf_bleeding_edge import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader @@ -28,14 +28,13 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel from modeling.tokenizer import GenericTokenizer # 4-bit dependencies +import gptq from pathlib import Path -sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) -from gptj import load_quant as gptj_load_quant -from gptneox import load_quant as gptneox_load_quant -from llama import load_quant as llama_load_quant -from opt import load_quant as opt_load_quant -from offload import load_quant_offload -monkey_patched_4bit = False +from gptq.gptj import load_quant as gptj_load_quant +from gptq.gptneox import load_quant as gptneox_load_quant +from gptq.llama import load_quant as llama_load_quant +from gptq.opt import load_quant as opt_load_quant +from gptq.offload import load_quant_offload def prepare_4bit_load(modelpath): @@ -63,9 +62,6 @@ def prepare_4bit_load(modelpath): groupsize = int(g[0]) break - global monkey_patched_4bit - - # Monkey-patch in old-format pt-file support if not result: print("4-bit file not found, falling back to old format.") for p in paths_4bit_old: @@ -78,28 +74,16 @@ def prepare_4bit_load(modelpath): print("4-bit old-format file not found, loading failed.") raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.") - import llama, opt, gptneox, gptj, old_quant - llama.make_quant = old_quant.old_make_quant - opt.make_quant = old_quant.old_make_quant - gptneox.make_quant = old_quant.old_make_quant - gptj.make_quant = old_quant.old_make_quant - monkey_patched_4bit = True - elif monkey_patched_4bit: - # Undo monkey patch - print("Undoing 4-bit old format monkey patch") - import llama, opt, gptneox, gptj, quant - llama.make_quant = quant.make_quant - opt.make_quant = quant.make_quant - gptneox.make_quant = quant.make_quant - gptj.make_quant = quant.make_quant - monkey_patched_4bit = False + gptq.modelutils.set_gptq_version(0) + else: + gptq.modelutils.set_gptq_version(1) return result, groupsize def load_model_gptq_settings(): try: - js = json.loads(str(model.model_config).partition(' ')[2]) + js = json.loads(str(model.model_config).partition(' ')[2]) except Exception as e: try: try: diff --git a/repos/__init__.py b/repos/__init__.py deleted file mode 100644 index af438273..00000000 --- a/repos/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . 
import hf_bleeding_edge diff --git a/repos/gptq b/repos/gptq deleted file mode 160000 index 3c16fd9c..00000000 --- a/repos/gptq +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3c16fd9c7946ebe85df8d951cb742adbc1966ec7 diff --git a/repos/hf_bleeding_edge b/repos/hf_bleeding_edge deleted file mode 160000 index b5d0b80c..00000000 --- a/repos/hf_bleeding_edge +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b5d0b80c6947605b9ccf080fc17b68a516ea5857 From a9fa199c49ee8e903d609f2cab394a87b8a87d24 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 6 May 2023 21:30:33 +0200 Subject: [PATCH 068/113] Rename gptq module, pull fix --- docs/gptq-whl-links.html | 6 +++--- environments/huggingface.yml | 2 +- environments/rocm.yml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 427185db..08cd0cd7 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -1,3 +1,3 @@ -gptq-0.0.1-cp38-cp38-linux_x86_64.whl -gptq_rocm-0.0.1-cp38-cp38-linux_x86_64.whl -gptq-0.0.1-cp38-cp38-win_amd64.whl +gptq_koboldai-0.0.1-cp38-cp38-linux_x86_64.whl +gptq_koboldai_rocm-0.0.1-cp38-cp38-linux_x86_64.whl +gptq_koboldai-0.0.1-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index f7fad2de..12978b39 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -49,5 +49,5 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq + - gptq_koboldai==0.0.1 - einops diff --git a/environments/rocm.yml b/environments/rocm.yml index 2b979d4c..0cb44eb1 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -44,5 +44,5 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_rocm + - gptq_koboldai_rocm==0.0.1 - einops From 9ec50c997280856dee810a74e18cd11fd5304228 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 6 May 2023 21:58:23 +0200 Subject: [PATCH 069/113] Fix 4-bit mpt --- modeling/inference_models/hf_torch_4bit.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 959d6258..8aaddcc1 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -34,6 +34,7 @@ from gptq.gptj import load_quant as gptj_load_quant from gptq.gptneox import load_quant as gptneox_load_quant from gptq.llama import load_quant as llama_load_quant from gptq.opt import load_quant as opt_load_quant +from gptq.mpt import load_quant as mpt_load_quant from gptq.offload import load_quant_offload @@ -369,6 +370,8 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "opt": model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) + elif utils.koboldai_vars.model_type == "mpt": + model = load_quant_offload(mpt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) else: raise RuntimeError(f"4-bit load failed. 
Model type {utils.koboldai_vars.model_type} not supported in 4-bit") From 51e6dcdcd4c1a69318f3818a7cb153f7221ad07f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 7 May 2023 06:42:32 +0200 Subject: [PATCH 070/113] Revert accidental install_requirements change --- install_requirements.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install_requirements.sh b/install_requirements.sh index 561b1b00..6e37c7e9 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -5,14 +5,14 @@ if [[ $1 = "cuda" || $1 = "CUDA" ]]; then wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster -# bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y +bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y exit fi if [[ $1 = "rocm" || $1 = "ROCM" ]]; then wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster -# bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y +bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y exit fi echo Please specify either CUDA or ROCM From 6b4d3218d62a35623a42e775d93b09da26f1aabc Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 7 May 2023 06:55:51 +0200 Subject: [PATCH 071/113] Fix OOM when loading large model split across GPUs --- modeling/inference_models/hf_torch_4bit.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 8aaddcc1..350cd761 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -139,10 +139,8 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): self.gpu_layers_list = [int(l) for l in gpulayers.split(",")] except ValueError: self.gpu_layers_list = [utils.num_layers(self.model_config)] - self.offload_4bit = sum(self.gpu_layers_list) < utils.num_layers(self.model_config) - if self.offload_4bit: - utils.koboldai_vars.lazy_load = False + if sum(self.gpu_layers_list) < utils.num_layers(self.model_config): print("4-bit CPU offloader active") tf_kwargs = { @@ -343,9 +341,6 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): self.patch_embedding() - if not self.offload_4bit: - self.model = self.model.half().to(utils.koboldai_vars.gpu_device) - self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() @@ -375,7 +370,7 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): else: raise RuntimeError(f"4-bit load failed. 
Model type {utils.koboldai_vars.model_type} not supported in 4-bit") - return model.half() if not self.offload_4bit else model + return model def _get_tokenizer(self, location: str): if utils.koboldai_vars.model_type == "llama": From e55a9d31c2e067ed42732dafddd6c67b696f3ceb Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 8 May 2023 22:55:59 +0200 Subject: [PATCH 072/113] Update readme, clean up gitmodules file --- .gitmodules | 8 -------- README.md | 6 ++---- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/.gitmodules b/.gitmodules index 4a1fb7c9..0107a8c3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,11 +4,3 @@ [submodule "KoboldAI-Horde-Bridge"] path = KoboldAI-Horde-Bridge url = https://github.com/db0/KoboldAI-Horde-Bridge -[submodule "repos/gptq"] - path = repos/gptq - url = https://github.com/0cc4m/GPTQ-for-LLaMa - branch = a8303654c200c25577130466e5f9bc1e70fc8a50 -[submodule "repos/hf_bleeding_edge"] - path = repos/hf_bleeding_edge - url = https://github.com/0cc4m/hf_bleeding_edge - branch = b5d0b80c6947605b9ccf080fc17b68a516ea5857 diff --git a/README.md b/README.md index aadfd345..517c00e8 100644 --- a/README.md +++ b/README.md @@ -42,16 +42,14 @@ Put your 4bit quantized .pt or .safetensors in that folder with all associated . Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) -So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). +So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, it'll load the quantized model. #### Running KoboldAI and loading 4bit models If you haven't done so already, exit the command prompt/leave KAI's conda env. (Close the commandline window on Windows, run `exit` on Linux) Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD] -Switch to UI2, then load your model and be sure 4-bit toggle is on. - -The 4bit toggle shows when a model to load is selected. +Switch to UI2, then load your model. 
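As a rough pre-flight check (not part of the patch; the folder path below is only an example), the following sketch mirrors the detection logic these patches add in `load_model_gptq_settings`: a model folder is treated as GPTQ-quantized when its `config.json` contains `gptq_bits`, or when a legacy `4bit*.pt` / `4bit*.safetensors` file is present:

```
import glob, json, os

def looks_like_gptq_model(path: str) -> bool:
    # config.json advertising gptq_bits marks the folder as a GPTQ model ...
    try:
        with open(os.path.join(path, "config.json")) as f:
            if "gptq_bits" in json.load(f):
                return True
    except (OSError, json.JSONDecodeError):
        pass
    # ... otherwise fall back to the legacy 4bit* file naming convention.
    legacy = glob.glob(os.path.join(path, "4bit*.pt")) + \
             glob.glob(os.path.join(path, "4bit*.safetensors"))
    return bool(legacy)

print(looks_like_gptq_model("models/llama-13b-4bit-128g"))  # example path
```

If this prints `False`, double-check the file names in your model folder against the convention described above.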
## KoboldAI - Your gateway to GPT writing From 4f94247910c1785b4fa15dc5eb81d664978a3f91 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 8 May 2023 22:56:17 +0200 Subject: [PATCH 073/113] Fix chat mode empty generation error --- utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils.py b/utils.py index 89b9fb4f..54083339 100644 --- a/utils.py +++ b/utils.py @@ -714,7 +714,7 @@ def applyoutputformatting(txt, no_sentence_trimming=False, no_single_line=False) txt = replaceblanklines(txt) # trim off starting new lines in replies if we're in chat mode - if koboldai_vars.chatmode and txt[0] == "\n": + if koboldai_vars.chatmode and txt and txt[0] == "\n": txt = txt[1:] # Remove special characters From 61215981424ce8abba076e687da0e60149b655ea Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 8 May 2023 22:57:09 +0200 Subject: [PATCH 074/113] Fix multigpu loading without lazy-loader --- modeling/inference_models/generic_hf_torch.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py index 78a4bf9f..2772503b 100644 --- a/modeling/inference_models/generic_hf_torch.py +++ b/modeling/inference_models/generic_hf_torch.py @@ -243,6 +243,11 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): ) shutil.rmtree("cache/") + if not self.lazy_load: + utils.layers_module_names = utils.get_layers_module_names(self.model) + utils.module_names = list(self.model.state_dict().keys()) + utils.named_buffers = list(self.model.named_buffers(recurse=True)) + self.patch_embedding() if utils.koboldai_vars.hascuda: From a2d01bb9e454a0c951fc9c4c3e67599bcf188b5b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 9 May 2023 22:19:18 +0200 Subject: [PATCH 075/113] Update to GPTQ module 0.0.2, add support for upstream cuda quantizations, automatic detection --- docs/gptq-whl-links.html | 4 ++ environments/huggingface.yml | 2 +- environments/rocm.yml | 6 +-- koboldai_settings.py | 1 + modeling/inference_models/hf.py | 3 +- modeling/inference_models/hf_torch_4bit.py | 59 +++++++++++++--------- 6 files changed, 46 insertions(+), 29 deletions(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 08cd0cd7..64d15d3d 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -1,3 +1,7 @@ gptq_koboldai-0.0.1-cp38-cp38-linux_x86_64.whl gptq_koboldai_rocm-0.0.1-cp38-cp38-linux_x86_64.whl gptq_koboldai-0.0.1-cp38-cp38-win_amd64.whl + +gptq_koboldai-0.0.2-cp38-cp38-linux_x86_64.whl +gptq_koboldai_rocm-0.0.2-cp38-cp38-linux_x86_64.whl +gptq_koboldai-0.0.2-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 12978b39..c381ea94 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -49,5 +49,5 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai==0.0.1 + - gptq_koboldai==0.0.2 - einops diff --git a/environments/rocm.yml b/environments/rocm.yml index 0cb44eb1..4f6cfa11 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -24,8 +24,8 @@ dependencies: - Pillow - psutil - pip: - - --extra-index-url https://download.pytorch.org/whl/rocm5.4.2 - - torch==2.0.0+rocm5.4.2 + - --extra-index-url https://download.pytorch.org/whl/rocm5.2 + - torch==1.13.1+rocm5.2 - flask-cloudflared==0.0.10 - flask-ngrok - flask-cors @@ -44,5 +44,5 @@ dependencies: - diffusers - 
git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai_rocm==0.0.1 + - gptq_koboldai_rocm==0.0.2 - einops diff --git a/koboldai_settings.py b/koboldai_settings.py index 3e0fc48a..f0df2162 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -925,6 +925,7 @@ class story_settings(settings): self.gptq_model = False self.gptq_bits = -1 self.gptq_groupsize = -1 + self.gptq_version = -1 self.gptq_file = None self.save_paths = SavePaths(os.path.join("stories", self.story_name or "Untitled")) diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 5ee2abaa..7050f34e 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -181,7 +181,8 @@ class HFInferenceModel(InferenceModel): if "gptq_bits" in dir(self.model_config): utils.koboldai_vars.gptq_model = True utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits - utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize + utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1 + utils.koboldai_vars.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1 utils.koboldai_vars.gptq_file = None else: utils.koboldai_vars.gptq_model = False diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 350cd761..5917a43e 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -48,8 +48,7 @@ def prepare_4bit_load(modelpath): return path_4bit, False # Legacy format support - paths_4bit = ["4bit*.safetensors", "4bit*.pt"] - paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] + paths_4bit = ["4bit*.safetensors", "4bit*.pt", "4bit-old.safetensors", "4bit-old.pt"] result = False groupsize = -1 for p in paths_4bit: @@ -59,26 +58,11 @@ def prepare_4bit_load(modelpath): result = val[0] fname = Path(result).parts[-1] g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + groupsize = -1 if g: groupsize = int(g[0]) break - if not result: - print("4-bit file not found, falling back to old format.") - for p in paths_4bit_old: - p = os.path.join(modelpath, p) - if os.path.isfile(p): - result = p - break - - if not result: - print("4-bit old-format file not found, loading failed.") - raise RuntimeError("4-bit load failed. 
PT/Safetensors-File not found.") - - gptq.modelutils.set_gptq_version(0) - else: - gptq.modelutils.set_gptq_version(1) - return result, groupsize @@ -103,6 +87,7 @@ def load_model_gptq_settings(): safetensors_file = os.path.join(utils.koboldai_vars.custmodpth, "model.safetensors") pt_file = os.path.join(utils.koboldai_vars.custmodpth, "model.ckpt") utils.koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file + utils.koboldai_vars.gptq_version = js.get("gptq_version", -1) elif gptq_legacy_files: utils.koboldai_vars.gptq_model = True utils.koboldai_vars.gptq_bits = 4 @@ -110,10 +95,37 @@ def load_model_gptq_settings(): fname = Path(utils.koboldai_vars.gptq_file).parts[-1] g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) utils.koboldai_vars.gptq_groupsize = int(g[0]) if g else -1 + utils.koboldai_vars.gptq_version = -1 else: utils.koboldai_vars.gptq_model = False +def get_gptq_version(fpath): + v1_strings = ["zeros", "scales", "bias", "qweight"] + v2_strings = ["qzeros", "scales", "bias", "qweight"] + v3_strings = ["qzeros", "scales", "g_idx", "qweight"] + + with open(fpath, "rb") as f: + data = str(f.read(1024*1024)) + + v0 = all([s in data for s in v1_strings]) and not "qzeros" in data + v1 = all([s in data for s in v2_strings]) + v2 = all([s in data for s in v3_strings]) + + if v2: + if v0 or v1: + logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}") + return 2 + if v1: + if v0 or v2: + logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}") + return 1 + if v0: + if v1 or v2: + logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}") + return 0 + + class HFTorch4BitInferenceModel(HFTorchInferenceModel): def _load(self, save_model: bool, initial_load: bool) -> None: utils.koboldai_vars.allowsp = True @@ -140,9 +152,6 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): except ValueError: self.gpu_layers_list = [utils.num_layers(self.model_config)] - if sum(self.gpu_layers_list) < utils.num_layers(self.model_config): - print("4-bit CPU offloader active") - tf_kwargs = { "low_cpu_mem_usage": True, } @@ -351,12 +360,14 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth) + if utils.koboldai_vars.gptq_version < 0: + utils.koboldai_vars.gptq_version = get_gptq_version(path_4bit) + gptq.modelutils.set_gptq_version(utils.koboldai_vars.gptq_version) + if legacy_groupsize is not False: groupsize = legacy_groupsize - print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") - - print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit") + logger.info(f"Using 4-bit file: {path_4bit}, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}") if utils.koboldai_vars.model_type == "gptj": model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "gpt_neox": From 266c0574f671e3038b75ee1d396c761d095f3592 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 13 May 2023 20:15:11 +0200 Subject: [PATCH 076/113] Fix 4bit pt loading, add traceback output to GPT2 fallback --- modeling/inference_models/hf_torch.py | 5 +++-- modeling/lazy_loader.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 3339a75d..dfb9d5f9 100644 --- a/modeling/inference_models/hf_torch.py +++ 
b/modeling/inference_models/hf_torch.py @@ -291,7 +291,7 @@ class HFTorchInferenceModel(HFInferenceModel): logger.error("Invalid load key! Aborting.") raise - logger.warning(f"Fell back to GPT2LMHeadModel due to {e}") + logger.warning(f"Fell back to GPT2LMHeadModel due to {traceback.format_exc()}") try: return GPT2LMHeadModel.from_pretrained(location, **tf_kwargs) except Exception as e: @@ -538,7 +538,8 @@ class HFTorchInferenceModel(HFInferenceModel): try: f = z.open(f"archive/data/{storage_key}") except: - f = z.open(f"{zipfolder}/data/{storage_key}") + ziproot = z.namelist()[0].split("/")[0] + f = z.open(f"{ziproot}/data/{storage_key}") current_offset = 0 if current_offset != model_dict[key].seek_offset: f.read(model_dict[key].seek_offset - current_offset) diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index 3dee5bae..14ece404 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -141,7 +141,8 @@ class TorchLazyTensor(LazyTensor): try: f = checkpoint.open(f"archive/data/{self.key}", "r") except: - f = checkpoint.open(f"{filename}/data/{self.key}", "r") + ziproot = z.namelist()[0].split("/")[0] + f = z.open(f"{ziproot}/data/{self.key}", "r") f.read(self.seek_offset) else: f = checkpoint From 7f7b350741ebeb7e9157a240846740a845d077e6 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 13 May 2023 20:31:01 +0200 Subject: [PATCH 077/113] Catch further error during multigpu 4bit setup --- modeling/inference_models/hf_torch_4bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 5917a43e..2fd4cb89 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -149,7 +149,7 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): try: self.gpu_layers_list = [int(l) for l in gpulayers.split(",")] - except ValueError: + except (ValueError, AttributeError): self.gpu_layers_list = [utils.num_layers(self.model_config)] tf_kwargs = { From 3d4d5df76bfc2e6c832c3e8f174f77a23557cf02 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 13 May 2023 20:33:13 +0200 Subject: [PATCH 078/113] Remove rocm wheel, because it didn't work correctly --- README.md | 3 +-- environments/rocm.yml | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index 517c00e8..5f4bc5c7 100644 --- a/README.md +++ b/README.md @@ -28,8 +28,7 @@ Next step, (Windows) subfolder mode or B: option doesn't matter choose either ``` ./install_requirements.sh rocm ./commandline-rocm.sh - cd repos/gptq - python setup_cuda.py install + pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa@c884b421a233f9603d8224c9b22c2d83dd2c1fc4 ``` * If you get error missing hip/hip_runtime_xxx.h you dont have proper rocm & hip pkg installed * If you get CUDA_HOME envar is not set run in env: diff --git a/environments/rocm.yml b/environments/rocm.yml index 4f6cfa11..4e53a821 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -43,6 +43,4 @@ dependencies: - pydub - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai_rocm==0.0.2 - einops From 2c18d9f2b5dba9caad378f3ed04f84d408720e36 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 18 May 2023 21:51:03 +0200 Subject: [PATCH 079/113] Update GPTQ module to 0.0.3 --- docs/gptq-whl-links.html | 3 +++ environments/huggingface.yml | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git 
a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 64d15d3d..750c0746 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -5,3 +5,6 @@ gptq_koboldai-0.0.2-cp38-cp38-linux_x86_64.whl gptq_koboldai_rocm-0.0.2-cp38-cp38-linux_x86_64.whl gptq_koboldai-0.0.2-cp38-cp38-win_amd64.whl + +gptq_koboldai-0.0.3-cp38-cp38-linux_x86_64.whl +gptq_koboldai-0.0.3-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index c381ea94..e4aac1ed 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -49,5 +49,5 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai==0.0.2 + - gptq_koboldai==0.0.3 - einops From d5eac13d9f76484d991e33e0cc3a487fc5119937 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 19 May 2023 18:22:26 +0200 Subject: [PATCH 080/113] Fix 2, 3 and 8-bit loading --- modeling/inference_models/hf_torch_4bit.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 2fd4cb89..580fa306 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -367,17 +367,17 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): if legacy_groupsize is not False: groupsize = legacy_groupsize - logger.info(f"Using 4-bit file: {path_4bit}, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}") + logger.info(f"Using GPTQ file: {path_4bit}, {utils.koboldai_vars.gptq_bits}-bit model, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}") if utils.koboldai_vars.model_type == "gptj": - model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) + model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "gpt_neox": - model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) + model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "llama": - model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) + model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "opt": - model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) + model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "mpt": - model = load_quant_offload(mpt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) + model = load_quant_offload(mpt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) else: raise RuntimeError(f"4-bit load failed. 
Model type {utils.koboldai_vars.model_type} not supported in 4-bit") From c32932998dda6ababec9687e0d4970a6f0f70922 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 19 May 2023 21:51:38 +0200 Subject: [PATCH 081/113] Update GPTQ module to 0.0.4 --- docs/gptq-whl-links.html | 3 +++ environments/huggingface.yml | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 750c0746..34d05691 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -8,3 +8,6 @@ gptq_koboldai-0.0.3-cp38-cp38-linux_x86_64.whl gptq_koboldai-0.0.3-cp38-cp38-win_amd64.whl + +gptq_koboldai-0.0.4-cp38-cp38-linux_x86_64.whl +gptq_koboldai-0.0.4-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index e4aac1ed..c7d03ad0 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -49,5 +49,5 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai==0.0.3 + - gptq_koboldai==0.0.4 - einops From e49d35afc935f3a52155a0bc9f9d200a84e1ad41 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 28 May 2023 22:54:36 +0200 Subject: [PATCH 082/113] Add 4bit plugin --- aiserver.py | 1 - .../inference_models/4bit_hf_torch/class.py | 227 ++++++++++ .../generic_hf_torch/class.py | 13 +- modeling/inference_models/hf.py | 4 - modeling/inference_models/hf_torch_4bit.py | 392 ------------------ 5 files changed, 233 insertions(+), 404 deletions(-) create mode 100644 modeling/inference_models/4bit_hf_torch/class.py delete mode 100644 modeling/inference_models/hf_torch_4bit.py diff --git a/aiserver.py b/aiserver.py index c28633d6..3c574431 100644 --- a/aiserver.py +++ b/aiserver.py @@ -623,7 +623,6 @@ utils.socketio = socketio # Weird import position to steal koboldai_vars from utils from modeling.patches import patch_transformers -from modeling.inference_models.hf_torch_4bit import load_model_gptq_settings #Load all of the model importers import importlib diff --git a/modeling/inference_models/4bit_hf_torch/class.py b/modeling/inference_models/4bit_hf_torch/class.py new file mode 100644 index 00000000..62f04bfb --- /dev/null +++ b/modeling/inference_models/4bit_hf_torch/class.py @@ -0,0 +1,227 @@ +from __future__ import annotations + +import os +import glob +import json +import torch +import re +import shutil +import sys +from typing import Union + +from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer +from hf_bleeding_edge import AutoModelForCausalLM + +import utils +import modeling.lazy_loader as lazy_loader +import koboldai_settings +from logger import logger, set_logger_verbosity + +try: + import breakmodel +except ModuleNotFoundError as e: + # Breakmodel is only expected to work on GPU + if not utils.koboldai_vars.use_colab_tpu: + raise e + +from modeling.inference_models.hf_torch import HFTorchInferenceModel +from modeling.tokenizer import GenericTokenizer + +# 4-bit dependencies +import gptq +from pathlib import Path +from gptq.gptj import load_quant as gptj_load_quant +from gptq.gptneox import load_quant as gptneox_load_quant +from gptq.llama import load_quant as llama_load_quant +from gptq.opt import load_quant as opt_load_quant +from gptq.mpt import load_quant as mpt_load_quant +from gptq.offload import load_quant_offload + + +model_backend_name = "Huggingface GPTQ" + + +def load_model_gptq_settings(path): + try: + js = json.load(open(path + "/config.json", "r")) + except 
Exception as e: + return False, -1, -1, False, -1 + + gptq_model = False + gptq_bits = -1 + gptq_groupsize = -1 + gptq_file = False + gptq_version = -1 + + gptq_legacy_files = glob.glob(os.path.join(path, "4bit*.pt")) + glob.glob(os.path.join(path, "4bit*.safetensors")) + if "gptq_bits" in js: + gptq_model = True + gptq_bits = js["gptq_bits"] + gptq_groupsize = js.get("gptq_groupsize", -1) + safetensors_file = os.path.join(path, "model.safetensors") + pt_file = os.path.join(path, "model.ckpt") + gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file + gptq_version = js.get("gptq_version", -1) + elif gptq_legacy_files: + gptq_model = True + gptq_bits = 4 + gptq_file = gptq_legacy_files[0] + fname = Path(gptq_file).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + gptq_groupsize = int(g[0]) if g else -1 + gptq_version = -1 + + return gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version + + +def get_gptq_version(fpath): + v1_strings = ["zeros", "scales", "bias", "qweight"] + v2_strings = ["qzeros", "scales", "bias", "qweight"] + v3_strings = ["qzeros", "scales", "g_idx", "qweight"] + + with open(fpath, "rb") as f: + data = str(f.read(1024*1024)) + + v0 = all([s in data for s in v1_strings]) and not "qzeros" in data + v1 = all([s in data for s in v2_strings]) + v2 = all([s in data for s in v3_strings]) + + if v2: + if v0 or v1: + logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}") + return 2 + if v1: + if v0 or v2: + logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}") + return 1 + if v0: + if v1 or v2: + logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}") + return 0 + + +class model_backend(HFTorchInferenceModel): + def is_valid(self, model_name, model_path, menu_path): + gptq_model, _, _, _, _ = load_model_gptq_settings(model_path) + return gptq_model + + def _load(self, save_model: bool, initial_load: bool) -> None: + # Make model path the same as the model name to make this consistent + # with the other loading method if it isn't a known model type. 
This + # code is not just a workaround for below, it is also used to make the + # behavior consistent with other loading methods - Henk717 + # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]: + # utils.koboldai_vars.custmodpth = utils.koboldai_vars.model + + self.init_model_config() + + self.lazy_load = False + + gpulayers = breakmodel.gpu_blocks + + try: + self.gpu_layers_list = [int(l) for l in gpulayers.split(",")] + except (ValueError, AttributeError): + self.gpu_layers_list = [utils.num_layers(self.model_config)] + + tf_kwargs = { + "low_cpu_mem_usage": True, + } + + # If we're using torch_lazy_loader, we need to get breakmodel config + # early so that it knows where to load the individual model tensors + logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel)) + if ( + self.lazy_load + and utils.koboldai_vars.hascuda + and utils.koboldai_vars.breakmodel + and not utils.koboldai_vars.nobreakmodel + ): + self.breakmodel_device_config(self.model_config) + + if self.lazy_load: + # If we're using lazy loader, we need to figure out what the model's hidden layers are called + with lazy_loader.use_lazy_load( + dematerialized_modules=True, use_accelerate_init_empty_weights=True + ): + try: + metamodel = AutoModelForCausalLM.from_config(self.model_config) + utils.layers_module_names = utils.get_layers_module_names(metamodel) + utils.module_names = list(metamodel.state_dict().keys()) + utils.named_buffers = list(metamodel.named_buffers(recurse=True)) + except Exception as e: + logger.warning(f"Gave up on lazy loading due to {e}") + self.lazy_load = False + + # Download model from Huggingface if it does not exist, otherwise load locally + with self._maybe_use_float16(), lazy_loader.use_lazy_load( + enable=self.lazy_load, + callback=self._get_lazy_load_callback(utils.num_layers(self.model_config)) + if self.lazy_load + else None, + dematerialized_modules=True, + ): + if self.lazy_load: + # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time + tf_kwargs.pop("low_cpu_mem_usage", None) + + if self.get_local_model_path(): + # Model is stored locally, load it. 
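+ # (explanatory note) _get_model() below reads the quantization settings via
+ # load_model_gptq_settings()/get_gptq_version(), then hands the checkpoint to the
+ # matching gptq loader (gptj/gpt_neox/llama/opt/mpt) through load_quant_offload(),
+ # which distributes layers across GPUs according to self.gpu_layers_list.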
+ self.model = self._get_model(self.get_local_model_path(), tf_kwargs) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) + else: + raise NotImplementedError("GPTQ Model downloading not implemented") + + if not self.lazy_load: + utils.layers_module_names = utils.get_layers_module_names(self.model) + utils.module_names = list(self.model.state_dict().keys()) + utils.named_buffers = list(self.model.named_buffers(recurse=True)) + + if ( + utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default + and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj") + ): + utils.koboldai_vars.badwordsids = [ + [v] + for k, v in self.tokenizer.get_vocab().items() + if any(c in str(k) for c in "[]") + ] + + self.patch_embedding() + + self.model.kai_model = self + utils.koboldai_vars.modeldim = self.get_hidden_size() + + def _get_model(self, location: str, tf_kwargs: Dict): + gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location) + + if gptq_version < 0: + gptq_version = get_gptq_version(gptq_file) + gptq.modelutils.set_gptq_version(gptq_version) + + model_type = self.get_model_type() + + logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}, groupsize {gptq_groupsize}") + if model_type == "gptj": + model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + elif model_type == "gpt_neox": + model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + elif model_type == "llama": + model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + elif model_type == "opt": + model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + elif model_type == "mpt": + model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + else: + raise RuntimeError(f"4-bit load failed. 
Model type {model_type} not supported in 4-bit") + + return model + + def _get_tokenizer(self, location: str): + model_type = self.get_model_type() + if model_type == "llama": + tokenizer = LlamaTokenizer.from_pretrained(location) + else: + tokenizer = AutoTokenizer.from_pretrained(location) + + return GenericTokenizer(tokenizer) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index 50d7503c..93bc08ea 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -25,8 +25,12 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel model_backend_name = "Huggingface" -class GenericHFTorchInferenceModel(HFTorchInferenceModel): - def load_config(self) -> None: +class model_backend(HFTorchInferenceModel): + + def _initialize_model(self): + return + + def _load(self, save_model: bool, initial_load: bool) -> None: utils.koboldai_vars.allowsp = True # Make model path the same as the model name to make this consistent @@ -243,11 +247,6 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): ) shutil.rmtree("cache/") - if not self.lazy_load: - utils.layers_module_names = utils.get_layers_module_names(self.model) - utils.module_names = list(self.model.state_dict().keys()) - utils.named_buffers = list(self.model.named_buffers(recurse=True)) - self.patch_embedding() diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 93e1757a..dc34636a 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -1,12 +1,8 @@ import os, sys from typing import Optional -<<<<<<< HEAD from hf_bleeding_edge import AutoConfig -======= -from transformers import AutoConfig import warnings ->>>>>>> ebolam/Model_Plugins import utils import json import koboldai_settings diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py deleted file mode 100644 index 580fa306..00000000 --- a/modeling/inference_models/hf_torch_4bit.py +++ /dev/null @@ -1,392 +0,0 @@ -from __future__ import annotations - -import os -import glob -import json -import torch -import re -import shutil -import sys -from typing import Union - -from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer -from hf_bleeding_edge import AutoModelForCausalLM - -import utils -import modeling.lazy_loader as lazy_loader -import koboldai_settings -from logger import logger, set_logger_verbosity - -try: - import breakmodel -except ModuleNotFoundError as e: - # Breakmodel is only expected to work on GPU - if not utils.koboldai_vars.use_colab_tpu: - raise e - -from modeling.inference_models.hf_torch import HFTorchInferenceModel -from modeling.tokenizer import GenericTokenizer - -# 4-bit dependencies -import gptq -from pathlib import Path -from gptq.gptj import load_quant as gptj_load_quant -from gptq.gptneox import load_quant as gptneox_load_quant -from gptq.llama import load_quant as llama_load_quant -from gptq.opt import load_quant as opt_load_quant -from gptq.mpt import load_quant as mpt_load_quant -from gptq.offload import load_quant_offload - - -def prepare_4bit_load(modelpath): - path_4bit = os.path.join(modelpath, "model.safetensors") - if os.path.isfile(path_4bit): - return path_4bit, False - - path_4bit = os.path.join(modelpath, "model.ckpt") - if os.path.isfile(path_4bit): - return path_4bit, False - - # Legacy format support - paths_4bit = ["4bit*.safetensors", "4bit*.pt", "4bit-old.safetensors", 
"4bit-old.pt"] - result = False - groupsize = -1 - for p in paths_4bit: - p = os.path.join(modelpath, p) - val = [v for v in glob.glob(p) if "4bit-old" not in v] - if val: - result = val[0] - fname = Path(result).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) - groupsize = -1 - if g: - groupsize = int(g[0]) - break - - return result, groupsize - - -def load_model_gptq_settings(): - try: - js = json.loads(str(model.model_config).partition(' ')[2]) - except Exception as e: - try: - try: - js = json.load(open(utils.koboldai_vars.custmodpth + "/config.json", "r")) - except Exception as e: - js = json.load(open(utils.koboldai_vars.custmodpth.replace('/', '_') + "/config.json", "r")) - except Exception as e: - utils.koboldai_vars.gptq_model = False - return - - gptq_legacy_files = glob.glob(os.path.join(utils.koboldai_vars.custmodpth, "4bit*.pt")) + glob.glob(os.path.join(utils.koboldai_vars.custmodpth, "4bit*.safetensors")) - if "gptq_bits" in js: - utils.koboldai_vars.gptq_model = True - utils.koboldai_vars.gptq_bits = js["gptq_bits"] - utils.koboldai_vars.gptq_groupsize = js.get("gptq_groupsize", -1) - safetensors_file = os.path.join(utils.koboldai_vars.custmodpth, "model.safetensors") - pt_file = os.path.join(utils.koboldai_vars.custmodpth, "model.ckpt") - utils.koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file - utils.koboldai_vars.gptq_version = js.get("gptq_version", -1) - elif gptq_legacy_files: - utils.koboldai_vars.gptq_model = True - utils.koboldai_vars.gptq_bits = 4 - utils.koboldai_vars.gptq_file = gptq_legacy_files[0] - fname = Path(utils.koboldai_vars.gptq_file).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) - utils.koboldai_vars.gptq_groupsize = int(g[0]) if g else -1 - utils.koboldai_vars.gptq_version = -1 - else: - utils.koboldai_vars.gptq_model = False - - -def get_gptq_version(fpath): - v1_strings = ["zeros", "scales", "bias", "qweight"] - v2_strings = ["qzeros", "scales", "bias", "qweight"] - v3_strings = ["qzeros", "scales", "g_idx", "qweight"] - - with open(fpath, "rb") as f: - data = str(f.read(1024*1024)) - - v0 = all([s in data for s in v1_strings]) and not "qzeros" in data - v1 = all([s in data for s in v2_strings]) - v2 = all([s in data for s in v3_strings]) - - if v2: - if v0 or v1: - logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}") - return 2 - if v1: - if v0 or v2: - logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}") - return 1 - if v0: - if v1 or v2: - logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}") - return 0 - - -class HFTorch4BitInferenceModel(HFTorchInferenceModel): - def _load(self, save_model: bool, initial_load: bool) -> None: - utils.koboldai_vars.allowsp = True - - # Make model path the same as the model name to make this consistent - # with the other loading method if it isn't a known model type. 
This - # code is not just a workaround for below, it is also used to make the - # behavior consistent with other loading methods - Henk717 - # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]: - # utils.koboldai_vars.custmodpth = utils.koboldai_vars.model - - if self.model_name == "NeoCustom": - self.model_name = os.path.basename( - os.path.normpath(utils.koboldai_vars.custmodpth) - ) - utils.koboldai_vars.model = self.model_name - - self.init_model_config() - - gpulayers = utils.args.breakmodel_gpulayers - - try: - self.gpu_layers_list = [int(l) for l in gpulayers.split(",")] - except (ValueError, AttributeError): - self.gpu_layers_list = [utils.num_layers(self.model_config)] - - tf_kwargs = { - "low_cpu_mem_usage": True, - } - - # If we're using torch_lazy_loader, we need to get breakmodel config - # early so that it knows where to load the individual model tensors - if ( - self.lazy_load - and utils.koboldai_vars.hascuda - and utils.koboldai_vars.breakmodel - and not utils.koboldai_vars.nobreakmodel - ): - self.breakmodel_device_config(self.model_config) - - if self.lazy_load: - # If we're using lazy loader, we need to figure out what the model's hidden layers are called - with lazy_loader.use_lazy_load( - dematerialized_modules=True, use_accelerate_init_empty_weights=True - ): - try: - metamodel = AutoModelForCausalLM.from_config(self.model_config) - utils.layers_module_names = utils.get_layers_module_names(metamodel) - utils.module_names = list(metamodel.state_dict().keys()) - utils.named_buffers = list(metamodel.named_buffers(recurse=True)) - except Exception as e: - logger.warning(f"Gave up on lazy loading due to {e}") - self.lazy_load = False - - # Download model from Huggingface if it does not exist, otherwise load locally - with self._maybe_use_float16(), lazy_loader.use_lazy_load( - enable=self.lazy_load, - callback=self._get_lazy_load_callback(utils.num_layers(self.model_config)) - if self.lazy_load - else None, - dematerialized_modules=True, - ): - if self.lazy_load: - # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - tf_kwargs.pop("low_cpu_mem_usage", None) - - if self.get_local_model_path(): - # Model is stored locally, load it. - self.model = self._get_model(self.get_local_model_path(), tf_kwargs) - self.tokenizer = self._get_tokenizer(self.get_local_model_path()) - else: - # Model not stored locally, we need to download it. 
- - # _rebuild_tensor patch for casting dtype and supporting LazyTensors - old_rebuild_tensor = torch._utils._rebuild_tensor - - def new_rebuild_tensor( - storage: Union[lazy_loader.LazyTensor, torch.Storage], - storage_offset, - shape, - stride, - ): - if not isinstance(storage, lazy_loader.LazyTensor): - dtype = storage.dtype - else: - dtype = storage.storage_type.dtype - if not isinstance(dtype, torch.dtype): - dtype = storage.storage_type(0).dtype - if dtype is torch.float32 and len(shape) >= 2: - utils.koboldai_vars.fp32_model = True - return old_rebuild_tensor(storage, storage_offset, shape, stride) - - torch._utils._rebuild_tensor = new_rebuild_tensor - self.model = self._get_model(self.model_name, tf_kwargs) - self.tokenizer = self._get_tokenizer(self.model_name) - torch._utils._rebuild_tensor = old_rebuild_tensor - - if save_model: - self.tokenizer.save_pretrained( - self.get_local_model_path(ignore_existance=True) - ) - - if utils.koboldai_vars.fp32_model and not breakmodel.disk_blocks: - # Use save_pretrained to convert fp32 models to fp16, - # unless we are using disk cache because save_pretrained - # is not supported in that case - self.model = self.model.half() - self.model.save_pretrained( - self.get_local_model_path(ignore_existance=True), - max_shard_size="500MiB", - ) - - else: - # For fp16 models, we can just copy the model files directly - import transformers.configuration_utils - import transformers.modeling_utils - import transformers.file_utils - import huggingface_hub - - # Save the config.json - shutil.move( - os.path.realpath( - huggingface_hub.hf_hub_download( - self.model_name, - transformers.configuration_utils.CONFIG_NAME, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=False, - ) - ), - os.path.join( - self.get_local_model_path(ignore_existance=True), - transformers.configuration_utils.CONFIG_NAME, - ), - ) - - if utils.num_shards is None: - # Save the pytorch_model.bin or model.safetensors of an unsharded model - any_success = False - possible_checkpoint_names = [ - transformers.modeling_utils.WEIGHTS_NAME, - "model.safetensors", - ] - - for possible_checkpoint_name in possible_checkpoint_names: - try: - shutil.move( - os.path.realpath( - huggingface_hub.hf_hub_download( - self.model_name, - possible_checkpoint_name, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=False, - ) - ), - os.path.join( - self.get_local_model_path( - ignore_existance=True - ), - possible_checkpoint_name, - ), - ) - any_success = True - except Exception: - pass - - if not any_success: - raise RuntimeError(f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revisison}'") - else: - # Handle saving sharded models - - with open(utils.from_pretrained_index_filename) as f: - map_data = json.load(f) - filenames = set(map_data["weight_map"].values()) - # Save the pytorch_model.bin.index.json of a sharded model - shutil.move( - os.path.realpath(utils.from_pretrained_index_filename), - os.path.join( - self.get_local_model_path(ignore_existance=True), - transformers.modeling_utils.WEIGHTS_INDEX_NAME, - ), - ) - # Then save the pytorch_model-#####-of-#####.bin files - for filename in filenames: - shutil.move( - os.path.realpath( - huggingface_hub.hf_hub_download( - self.model_name, - filename, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=False, - ) - ), - 
os.path.join( - self.get_local_model_path( - ignore_existance=True - ), - filename, - ), - ) - shutil.rmtree("cache/") - - if not self.lazy_load: - utils.layers_module_names = utils.get_layers_module_names(self.model) - utils.module_names = list(self.model.state_dict().keys()) - utils.named_buffers = list(self.model.named_buffers(recurse=True)) - - if ( - utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default - and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj") - ): - utils.koboldai_vars.badwordsids = [ - [v] - for k, v in self.tokenizer.get_vocab().items() - if any(c in str(k) for c in "[]") - ] - - self.patch_embedding() - - self.model.kai_model = self - utils.koboldai_vars.modeldim = self.get_hidden_size() - - def _get_model(self, location: str, tf_kwargs: Dict): - if not utils.koboldai_vars.custmodpth: - pass - groupsize = utils.koboldai_vars.gptq_groupsize - - path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth) - - if utils.koboldai_vars.gptq_version < 0: - utils.koboldai_vars.gptq_version = get_gptq_version(path_4bit) - gptq.modelutils.set_gptq_version(utils.koboldai_vars.gptq_version) - - if legacy_groupsize is not False: - groupsize = legacy_groupsize - - logger.info(f"Using GPTQ file: {path_4bit}, {utils.koboldai_vars.gptq_bits}-bit model, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}") - if utils.koboldai_vars.model_type == "gptj": - model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) - elif utils.koboldai_vars.model_type == "gpt_neox": - model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) - elif utils.koboldai_vars.model_type == "llama": - model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) - elif utils.koboldai_vars.model_type == "opt": - model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) - elif utils.koboldai_vars.model_type == "mpt": - model = load_quant_offload(mpt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) - else: - raise RuntimeError(f"4-bit load failed. 
Model type {utils.koboldai_vars.model_type} not supported in 4-bit") - - return model - - def _get_tokenizer(self, location: str): - if utils.koboldai_vars.model_type == "llama": - tokenizer = LlamaTokenizer.from_pretrained(utils.koboldai_vars.custmodpth) - else: - tokenizer = AutoTokenizer.from_pretrained(utils.koboldai_vars.custmodpth) - - return GenericTokenizer(tokenizer) From cf886de18b0a4d653a3f78b4dadaf390536fa322 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 30 May 2023 19:15:20 +0200 Subject: [PATCH 083/113] Remove leftover values fro koboldai_settings.py --- koboldai_settings.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/koboldai_settings.py b/koboldai_settings.py index ae8d33cc..cd8fdafa 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -920,13 +920,6 @@ class story_settings(settings): self.commentary_chance = 0 self.commentary_enabled = False - # 4bit model vals - self.gptq_model = False - self.gptq_bits = -1 - self.gptq_groupsize = -1 - self.gptq_version = -1 - self.gptq_file = None - self.save_paths = SavePaths(os.path.join("stories", self.story_name or "Untitled")) ################### must be at bottom ######################### From b7838c7dde202502369c2461834076adfc4e22a3 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 4 Jun 2023 08:06:48 +0200 Subject: [PATCH 084/113] Fall back to autogptq if available and model not supported by gptq-koboldai --- .../inference_models/4bit_hf_torch/class.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/modeling/inference_models/4bit_hf_torch/class.py b/modeling/inference_models/4bit_hf_torch/class.py index 62f04bfb..7d7dfc00 100644 --- a/modeling/inference_models/4bit_hf_torch/class.py +++ b/modeling/inference_models/4bit_hf_torch/class.py @@ -10,6 +10,7 @@ import sys from typing import Union from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer +import hf_bleeding_edge from hf_bleeding_edge import AutoModelForCausalLM import utils @@ -37,6 +38,13 @@ from gptq.opt import load_quant as opt_load_quant from gptq.mpt import load_quant as mpt_load_quant from gptq.offload import load_quant_offload +autogptq_support = True +try: + import auto_gptq + from auto_gptq import AutoGPTQForCausalLM +except ImportError: + autogptq_support = False + model_backend_name = "Huggingface GPTQ" @@ -212,6 +220,26 @@ class model_backend(HFTorchInferenceModel): model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) elif model_type == "mpt": model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + elif autogptq_support: + # Monkey patch in hf_bleeding_edge to avoid having to trust remote code + auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig + auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig + auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM + model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors")) + + # Patch in embeddings function + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + type(model).get_input_embeddings = get_input_embeddings + + # Patch in args support.. 
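# [Editor's note - not part of the original patch: a hedged sketch of why the two
# monkey patches in this AutoGPTQ fallback exist. AutoGPTQForCausalLM keeps the
# underlying transformers model at `self.model`, so forwarding
# get_input_embeddings() and generate() to it lets the rest of the HF torch
# backend treat the quantized wrapper like a plain HF model, e.g. the
# hypothetical call:
#     model.generate(input_ids, max_new_tokens=8)]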
+ def generate(self, *args, **kwargs): + """shortcut for model.generate""" + with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type): + return self.model.generate(*args, **kwargs) + + type(model).generate = generate else: raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit") From 974328ed22ceca9a6e1a1c37ed135977c3429fee Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 4 Jun 2023 09:02:15 +0200 Subject: [PATCH 085/113] Add 4bit requirements to requirements.txt --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index c98b7252..0707cebe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,4 +37,7 @@ pytest-html==3.2.0 pytest-metadata==2.0.4 requests-mock==1.10.0 safetensors==0.3.1 +git+https://github.com/0cc4m/hf_bleeding_edge/ +--find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html gptq_koboldai==0.0.4 +einops peft==0.3.0 From 05a0bfe6c4bac8a1f7c070203cb69d1825a70e4e Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 4 Jun 2023 09:44:28 +0200 Subject: [PATCH 086/113] Don't show HF support if no HF model files are found --- .../inference_models/generic_hf_torch/class.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index a0b7b4cb..b56a7c45 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -9,6 +9,8 @@ from typing import Union from transformers import GPTNeoForCausalLM, GPT2LMHeadModel from hf_bleeding_edge import AutoModelForCausalLM +from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME + import utils import modeling.lazy_loader as lazy_loader import koboldai_settings @@ -27,6 +29,19 @@ model_backend_name = "Huggingface" class model_backend(HFTorchInferenceModel): + def is_valid(self, model_name, model_path, menu_path): + base_is_valid = super().is_valid(model_name, model_path, menu_path) + path = False + gen_path = "models/{}".format(model_name.replace('/', '_')) + if model_path is not None and os.path.exists(model_path): + path = model_path + elif os.path.exists(gen_path): + path = gen_path + + fnames = [WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME] + + return base_is_valid and any(os.path.exists(os.path.join(path, fname)) for fname in fnames) + def _initialize_model(self): return From c82625490a110bd5799463fa05f6ebc710e3516e Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 4 Jun 2023 12:31:24 +0200 Subject: [PATCH 087/113] Rename gptq backend folder --- .../inference_models/{4bit_hf_torch => gptq_hf_torch}/class.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename modeling/inference_models/{4bit_hf_torch => gptq_hf_torch}/class.py (100%) diff --git a/modeling/inference_models/4bit_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py similarity index 100% rename from modeling/inference_models/4bit_hf_torch/class.py rename to modeling/inference_models/gptq_hf_torch/class.py From b35f61e987841bd79dacdbe5c8b1cf6c75735f01 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 4 Jun 2023 15:40:12 +0200 Subject: [PATCH 088/113] Basic exllama plugin --- 
modeling/inference_models/exllama/class.py | 277 +++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 modeling/inference_models/exllama/class.py diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py new file mode 100644 index 00000000..0160ed4b --- /dev/null +++ b/modeling/inference_models/exllama/class.py @@ -0,0 +1,277 @@ +from __future__ import annotations + +import time, json +import torch +import requests +import numpy as np +from typing import List, Optional, Union +import os +import glob +from pathlib import Path +import re + +import utils +from logger import logger + +from modeling.inference_model import ( + GenerationResult, + GenerationSettings, + InferenceModel, + ModelCapabilities, +) + +from modeling.tokenizer import GenericTokenizer + +from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig +from transformers import LlamaTokenizer +from exllama.generator import ExLlamaGenerator + +import traceback + +model_backend_name = "ExLlama" + + +def load_model_gptq_settings(path): + try: + js = json.load(open(path + "/config.json", "r")) + except Exception as e: + return False + + gptq_model = False + gptq_file = False + + gptq_legacy_files = glob.glob(os.path.join(path, "4bit*.safetensors")) + if "gptq_bits" in js: + gptq_model = True + gptq_file = os.path.join(path, "model.safetensors") + elif gptq_legacy_files: + gptq_model = True + gptq_file = gptq_legacy_files[0] + fname = Path(gptq_file).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + + return gptq_model, gptq_file + + +class model_backend(InferenceModel): + def __init__(self) -> None: + super().__init__() + self.model_config = None + + self.model = None + self.tokenizer = None + + self.model_name = None + self.path = None + + def is_valid(self, model_name, model_path, menu_path): + gptq_model, _ = load_model_gptq_settings(model_path) + try: + self.model_config = self._load_config(model_name, model_path) + return self.model_config and gptq_model + except: + return False + + def get_local_model_path(self): + return self.path or os.path.join("models", self.model_name.replace("/", "_")) + + def _load_config(self, model_name, model_path): + if model_path is not None and os.path.exists(model_path): + return ExLlamaConfig(os.path.join(model_path, "config.json")) + if(os.path.exists("models/{}".format(model_name.replace('/', '_')))): + return ExLlamaConfig(os.path.join("models/{}".format(model_name.replace('/', '_')), "config.json")) + return False + + def _load(self, save_model: bool, initial_load: bool) -> None: + self.model = self._get_model(self.get_local_model_path(), {}) + self.tokenizer = self._get_tokenizer(os.path.join(self.get_local_model_path(), "tokenizer.model")) + + self.cache = ExLlamaCache(self.model) + + self.generator = ExLlamaGenerator(self.model, self.tokenizer.tokenizer, self.cache) + + def _post_load(self) -> None: + # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer + self.tokenizer.add_bos_token = False + + # HF transformers no longer supports decode_with_prefix_space + # We work around this by wrapping decode, encode, and __call__ + # with versions that work around the 'prefix space' misfeature + # of sentencepiece. 
+ vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size)) + has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")} + + # Wrap 'decode' with a method that always returns text starting with a space + # when the head token starts with a space. This is what 'decode_with_prefix_space' + # used to do, and we implement it using the same technique (building a cache of + # tokens that should have a prefix space, and then prepending a space if the first + # token is in this set.) We also work around a bizarre behavior in which decoding + # a single token 13 behaves differently than decoding a squence containing only [13]. + original_decode = type(self.tokenizer.tokenizer).decode + def decode_wrapper(self, token_ids, *args, **kwargs): + first = None + # Note, the code below that wraps single-value token_ids in a list + # is to work around this wonky behavior: + # >>> t.decode(13) + # '<0x0A>' + # >>> t.decode([13]) + # '\n' + # Not doing this causes token streaming to receive <0x0A> characters + # instead of newlines. + if isinstance(token_ids, int): + first = token_ids + token_ids = [first] + elif hasattr(token_ids, 'dim'): # Check for e.g. torch.Tensor + # Tensors don't support the Python standard of 'empty is False' + # and the special case of dimension 0 tensors also needs to be + # handled separately. + if token_ids.dim() == 0: + first = int(token_ids.item()) + token_ids = [first] + elif len(token_ids) > 0: + first = int(token_ids[0]) + elif token_ids is not None and len(token_ids) > 0: + first = token_ids[0] + result = original_decode(self, token_ids, *args, **kwargs) + if first is not None and first in has_prefix_space: + result = " " + result + return result + # GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it + object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer)) + + # Wrap encode and __call__ to work around the 'prefix space' misfeature also. + # The problem is that "Bob" at the start of text is encoded as if it is + # " Bob". This creates a problem because it means you can't split text, encode + # the pieces, concatenate the tokens, decode them, and get the original text back. + # The workaround is to prepend a known token that (1) starts with a space; and + # (2) is not the prefix of any other token. After searching through the vocab + # " ," (space comma) is the only token containing only printable ascii characters + # that fits this bill. By prepending ',' to the text, the original encode + # method always returns [1919, ...], where the tail of the sequence is the + # actual encoded result we want without the prefix space behavior. + original_encode = type(self.tokenizer.tokenizer).encode + def encode_wrapper(self, text, *args, **kwargs): + if type(text) is str: + text = ',' + text + result = original_encode(self, text, *args, **kwargs) + result = result[1:] + else: + result = original_encode(self, text, *args, **kwargs) + return result + object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer)) + + # Since 'encode' is documented as being deprecated, also override __call__. + # This doesn't appear to currently be used by KoboldAI, but doing so + # in case someone uses it in the future. 
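# [Editor's note - not part of the original patch: a hedged usage sketch of the
# decode/encode wrappers defined above. With both in place, piecewise encoding
# is expected to round-trip again:
#     ids = tokenizer.encode("Hello") + tokenizer.encode("world")
#     tokenizer.decode(ids)   # -> "Helloworld", not "Hello world"
# because encode() no longer invents a leading space for each fragment and
# decode() only re-adds a space when the first token actually carries one.]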
+ original_call = type(self.tokenizer.tokenizer).__call__ + def call_wrapper(self, text, *args, **kwargs): + if type(text) is str: + text = ',' + text + result = original_call(self, text, *args, **kwargs) + result = result[1:] + else: + result = original_call(self, text, *args, **kwargs) + return result + object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer)) + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + seed: Optional[int] = None, + **kwargs, + ) -> GenerationResult: + if not isinstance(prompt_tokens, torch.Tensor): + gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] + else: + gen_in = prompt_tokens + + self.generator.settings.temperature = max(gen_settings.temp, 0.01) + self.generator.settings.top_k = gen_settings.top_k if gen_settings.top_k > 0 else 10000 + self.generator.settings.top_p = gen_settings.top_p + self.generator.settings.min_p = 0.0 + + self.generator.gen_begin(gen_in) + + for i in range(max_new): + token = self.generator.gen_single_token() + if token.item() == self.tokenizer.eos_token_id: break + + return GenerationResult( + model=self, + out_batches=np.array( + self.generator.sequence[:, gen_in.size(1):], + ), + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) + + def _get_model(self, location: str, tf_kwargs: Dict): + _, self.model_config.model_path = load_model_gptq_settings(location) + return ExLlama(self.model_config) + + def _get_tokenizer(self, location: str): + tokenizer = GenericTokenizer(LlamaTokenizer.from_pretrained(location)) + tokenizer._koboldai_header = tokenizer.encode("") + return tokenizer + + def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}): + requested_parameters = [] + gpu_count = torch.cuda.device_count() + layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None + requested_parameters.append({ + "uitype": "Valid Display", + "unit": "text", + "label": "Current Allocated Layers: %1/{}".format(layer_count), #%1 will be the validation value + "id": "valid_layers", + "max": layer_count, + "step": 1, + "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="}, + "menu_path": "Layers", + "extra_classes": "", + "refresh_model_inputs": False + }) + for i in range(gpu_count): + requested_parameters.append({ + "uitype": "slider", + "unit": "int", + "label": "{} Layers".format(torch.cuda.get_device_name(i)), + "id": "{}_Layers".format(i), + "min": 0, + "max": layer_count, + "step": 1, + "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="}, + "check_message": "The sum of assigned layers must equal {}".format(layer_count), + "default": [layer_count if i == 0 else 0], + "tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)), + "menu_path": "Layers", + "extra_classes": "", + "refresh_model_inputs": False + }) + + return requested_parameters + + def set_input_parameters(self, parameters): + gpu_count = torch.cuda.device_count() + layers = [] + for i in range(gpu_count): + if isinstance(parameters["{}_Layers".format(i)], 
str) and parameters["{}_Layers".format(i)].isnumeric(): + layers.append(int(parameters["{}_Layers".format(i)])) + elif isinstance(parameters["{}_Layers".format(i)], str): + layers.append(None) + else: + layers.append(parameters["{}_Layers".format(i)]) + + self.layers = layers + for i, l in enumerate(layers): + if l > 0: + self.model_config.device_map.layers.extend([f"cuda:{i}"] * l) + self.model_config.device_map.lm_head = "cuda:0" + self.model_config.device_map.norm = "cuda:0" + + self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] + self.path = parameters['path'] if 'path' in parameters else None From 94520d5c80c571f0ae97d92c7641f743cf566f6b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 5 Jun 2023 18:43:57 +0200 Subject: [PATCH 089/113] Fix exllama model unload --- modeling/inference_models/exllama/class.py | 40 +++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 0160ed4b..db1728cf 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -9,6 +9,8 @@ import os import glob from pathlib import Path import re +import warnings +import gc import utils from logger import logger @@ -26,8 +28,6 @@ from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig from transformers import LlamaTokenizer from exllama.generator import ExLlamaGenerator -import traceback - model_backend_name = "ExLlama" @@ -60,8 +60,10 @@ class model_backend(InferenceModel): self.model = None self.tokenizer = None + self.cache = None + self.generator = None - self.model_name = None + self.model_name = "" self.path = None def is_valid(self, model_name, model_path, menu_path): @@ -84,7 +86,7 @@ class model_backend(InferenceModel): def _load(self, save_model: bool, initial_load: bool) -> None: self.model = self._get_model(self.get_local_model_path(), {}) - self.tokenizer = self._get_tokenizer(os.path.join(self.get_local_model_path(), "tokenizer.model")) + self.tokenizer = self._get_tokenizer(self.get_local_model_path())) self.cache = ExLlamaCache(self.model) @@ -174,6 +176,33 @@ class model_backend(InferenceModel): return result object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer)) + def unload(self): + self.model_config = None + + self.model = None + self.tokenizer = None + self.cache = None + self.generator = None + + self.model_name = "" + self.path = None + + with torch.no_grad(): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated") + for tensor in gc.get_objects(): + try: + if torch.is_tensor(tensor): + tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype)) + except: + pass + gc.collect() + try: + with torch.no_grad(): + torch.cuda.empty_cache() + except: + pass + def _raw_generate( self, prompt_tokens: Union[List[int], torch.Tensor], @@ -184,6 +213,9 @@ class model_backend(InferenceModel): seed: Optional[int] = None, **kwargs, ) -> GenerationResult: + if seed: + torch.manual_seed(seed) + if not isinstance(prompt_tokens, torch.Tensor): gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] else: From 39dfb1845570718d31490273bcb008718419b54e Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 6 Jun 2023 19:21:34 +0200 Subject: [PATCH 090/113] Replace exllama samplers with kobold's inbuilt ones --- modeling/inference_models/exllama/class.py | 56 +++++++++++++++++++++- 1 
file changed, 54 insertions(+), 2 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index db1728cf..3ff38d33 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -15,6 +15,10 @@ import gc import utils from logger import logger +from modeling import warpers +from modeling.warpers import Warper +from modeling.stoppers import Stoppers +from modeling.post_token_hooks import PostTokenHooks from modeling.inference_model import ( GenerationResult, GenerationSettings, @@ -30,6 +34,11 @@ from exllama.generator import ExLlamaGenerator model_backend_name = "ExLlama" +# When set to true, messages will appear in the console if samplers are not +# changing the scores. Keep in mind some samplers don't always change the +# scores for each token. +LOG_SAMPLER_NO_EFFECT = False + def load_model_gptq_settings(path): try: @@ -86,7 +95,7 @@ class model_backend(InferenceModel): def _load(self, save_model: bool, initial_load: bool) -> None: self.model = self._get_model(self.get_local_model_path(), {}) - self.tokenizer = self._get_tokenizer(self.get_local_model_path())) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) self.cache = ExLlamaCache(self.model) @@ -203,6 +212,34 @@ class model_backend(InferenceModel): except: pass + def _apply_warpers( + self, scores: torch.Tensor, input_ids: torch.Tensor + ) -> torch.Tensor: + warpers.update_settings() + + if LOG_SAMPLER_NO_EFFECT: + pre = torch.Tensor(scores) + + for sid in utils.koboldai_vars.sampler_order: + warper = Warper.from_id(sid) + + if not warper.value_is_valid(): + continue + + if warper == warpers.RepetitionPenalty: + # Rep pen needs more data than other samplers + scores = warper.torch(scores, input_ids=input_ids) + else: + scores = warper.torch(scores) + + assert scores is not None, f"Scores are None; warper '{warper}' is to blame" + + if LOG_SAMPLER_NO_EFFECT: + if torch.equal(pre, scores): + logger.info(warper, "had no effect on the scores.") + pre = torch.Tensor(scores) + return scores + def _raw_generate( self, prompt_tokens: Union[List[int], torch.Tensor], @@ -228,8 +265,23 @@ class model_backend(InferenceModel): self.generator.gen_begin(gen_in) + # from pudb.remote import set_trace + # set_trace(term_size=(200, 60)) + for i in range(max_new): - token = self.generator.gen_single_token() + logits = self.model.forward(self.generator.sequence[:, -1:], self.cache) + logits[:, :, self.tokenizer.bos_token_id] = -10000.0 + + logits = torch.unsqueeze(logits[0, -1, :], 0) + + scores = self._apply_warpers(logits, gen_in) + + scores = torch.softmax(scores, dim=-1) + + token = torch.multinomial(scores, 1) + + self.generator.gen_accept_token(token) + if token.item() == self.tokenizer.eos_token_id: break return GenerationResult( From 47b371b9d3a21c341e1386c523ec87c760393ff7 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 6 Jun 2023 19:51:38 +0200 Subject: [PATCH 091/113] Fix multigpu --- modeling/inference_models/exllama/class.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 3ff38d33..b17d04bf 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -295,7 +295,11 @@ class model_backend(InferenceModel): ) def _get_model(self, location: str, tf_kwargs: Dict): + if not self.model_config: + ExLlamaConfig(os.path.join(location, "config.json")) + _, self.model_config.model_path = 
load_model_gptq_settings(location) + # self.model_config.gpu_peer_fix = True return ExLlama(self.model_config) def _get_tokenizer(self, location: str): @@ -351,6 +355,7 @@ class model_backend(InferenceModel): layers.append(parameters["{}_Layers".format(i)]) self.layers = layers + self.model_config.device_map.layers = [] for i, l in enumerate(layers): if l > 0: self.model_config.device_map.layers.extend([f"cuda:{i}"] * l) From 12df8220fb2d6122ee828c0910943a8e08c7ebb4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 23 May 2023 06:59:28 +0200 Subject: [PATCH 092/113] Add gpt_bigcode support, fix 8-bit GPTQ incoherence --- docs/gptq-whl-links.html | 3 +++ environments/huggingface.yml | 2 +- modeling/inference_models/gptq_hf_torch/class.py | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 34d05691..0808dbc6 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -11,3 +11,6 @@ gptq_koboldai-0.0.4-cp38-cp38-linux_x86_64.whl gptq_koboldai-0.0.4-cp38-cp38-win_amd64.whl + +gptq_koboldai-0.0.5-cp38-cp38-linux_x86_64.whl +gptq_koboldai-0.0.5-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index acba0648..79258b60 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -47,6 +47,6 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai==0.0.4 + - gptq_koboldai==0.0.5 - einops - peft==0.3.0 diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 7d7dfc00..0cc1da8d 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -35,6 +35,7 @@ from gptq.gptj import load_quant as gptj_load_quant from gptq.gptneox import load_quant as gptneox_load_quant from gptq.llama import load_quant as llama_load_quant from gptq.opt import load_quant as opt_load_quant +from gptq.bigcode import load_quant as bigcode_load_quant from gptq.mpt import load_quant as mpt_load_quant from gptq.offload import load_quant_offload @@ -220,6 +221,8 @@ class model_backend(HFTorchInferenceModel): model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) elif model_type == "mpt": model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + elif model_type == "gpt_bigcode": + model = load_quant_offload(bigcode_load_quant, location, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list).half() elif autogptq_support: # Monkey patch in hf_bleeding_edge to avoid having to trust remote code auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig From 0001ae00ab76e94a1743cbd8cdacc5f2483afce0 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 12 Jun 2023 07:18:22 +0200 Subject: [PATCH 093/113] Add v2 with bias support (e.g. 
for Tulu-30b) --- .../inference_models/gptq_hf_torch/class.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 0cc1da8d..d07aef23 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -96,17 +96,17 @@ def get_gptq_version(fpath): v2 = all([s in data for s in v3_strings]) if v2: - if v0 or v1: - logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}") - return 2 + if v0: + logger.warning(f"GPTQ model identified as v2, but v0={v0}") + return 2, v1 if v1: if v0 or v2: logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}") - return 1 + return 1, False if v0: if v1 or v2: logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}") - return 0 + return 0, False class model_backend(HFTorchInferenceModel): @@ -203,26 +203,27 @@ class model_backend(HFTorchInferenceModel): def _get_model(self, location: str, tf_kwargs: Dict): gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location) + v2_bias = False if gptq_version < 0: - gptq_version = get_gptq_version(gptq_file) + gptq_version, v2_bias = get_gptq_version(gptq_file) gptq.modelutils.set_gptq_version(gptq_version) model_type = self.get_model_type() - logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}, groupsize {gptq_groupsize}") + logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}") if model_type == "gptj": - model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "gpt_neox": - model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "llama": - model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "opt": - model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "mpt": - model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "gpt_bigcode": - model = load_quant_offload(bigcode_load_quant, location, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list).half() + model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half() elif autogptq_support: # Monkey patch in hf_bleeding_edge to avoid having to trust remote code 
auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig From ebf7e2cf57efcab1a4998fc85029566700ce9497 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 12 Jun 2023 08:27:30 +0200 Subject: [PATCH 094/113] Update GPTQ module to 0.0.6 --- docs/gptq-whl-links.html | 3 +++ environments/huggingface.yml | 2 +- environments/rocm.yml | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 0808dbc6..b993d9bd 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -14,3 +14,6 @@ gptq_koboldai-0.0.5-cp38-cp38-linux_x86_64.whl gptq_koboldai-0.0.5-cp38-cp38-win_amd64.whl + +gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl +gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 79258b60..2c996ff9 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -47,6 +47,6 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai==0.0.5 + - gptq_koboldai==0.0.6 - einops - peft==0.3.0 diff --git a/environments/rocm.yml b/environments/rocm.yml index 7ef282cc..b85cfd74 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -42,5 +42,6 @@ dependencies: - pydub - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ + - git+https://github.com/0cc4m/GPTQ-for-LLaMa@0.0.6 - einops - peft==0.3.0 From 0c7eaefb1acc522eeed0b2dc1af78ec894b84a8b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 13 Jun 2023 10:11:29 +0200 Subject: [PATCH 095/113] Fix AMD ROCm exllama inference --- modeling/inference_models/exllama/class.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index b17d04bf..37681b4f 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -362,5 +362,10 @@ class model_backend(InferenceModel): self.model_config.device_map.lm_head = "cuda:0" self.model_config.device_map.norm = "cuda:0" + self.model_config.rmsnorm_no_half2 = bool(torch.version.hip) + self.model_config.rope_no_half2 = bool(torch.version.hip) + self.model_config.matmul_no_half2 = bool(torch.version.hip) + self.model_config.silu_no_half2 = bool(torch.version.hip) + self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] self.path = parameters['path'] if 'path' in parameters else None From e874f0c1c26501a0c2592b3acde8a3a271a7c50d Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 19 Jun 2023 19:05:31 +0200 Subject: [PATCH 096/113] Add token streaming support for exllama --- modeling/inference_models/exllama/class.py | 26 ++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 37681b4f..614a3de1 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -75,6 +75,25 @@ class model_backend(InferenceModel): self.model_name = "" self.path = None + self.post_token_hooks = [ + PostTokenHooks.stream_tokens, + ] + + self.stopper_hooks = [ + Stoppers.core_stopper, + Stoppers.dynamic_wi_scanner, + Stoppers.singleline_stopper, + Stoppers.chat_mode_stopper, + Stoppers.stop_sequence_stopper, + ] + + self.capabilties = ModelCapabilities( + embedding_manipulation=False, + post_token_hooks=True, + 
stopper_hooks=False, + post_token_probs=False, + ) + def is_valid(self, model_name, model_path, menu_path): gptq_model, _ = load_model_gptq_settings(model_path) try: @@ -265,11 +284,8 @@ class model_backend(InferenceModel): self.generator.gen_begin(gen_in) - # from pudb.remote import set_trace - # set_trace(term_size=(200, 60)) - for i in range(max_new): - logits = self.model.forward(self.generator.sequence[:, -1:], self.cache) + logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache) logits[:, :, self.tokenizer.bos_token_id] = -10000.0 logits = torch.unsqueeze(logits[0, -1, :], 0) @@ -282,6 +298,8 @@ class model_backend(InferenceModel): self.generator.gen_accept_token(token) + self._post_token_gen(self.generator.sequence) + if token.item() == self.tokenizer.eos_token_id: break return GenerationResult( From a191855b37407f91f03576814a1cb4b548100183 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 19 Jun 2023 19:14:04 +0200 Subject: [PATCH 097/113] Track token generation progress --- modeling/inference_models/exllama/class.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 614a3de1..811f8da1 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -300,8 +300,12 @@ class model_backend(InferenceModel): self._post_token_gen(self.generator.sequence) + utils.koboldai_vars.generated_tkns += 1 + if token.item() == self.tokenizer.eos_token_id: break + utils.koboldai_vars.generated_tkns = max_new + return GenerationResult( model=self, out_batches=np.array( From e8741a1b5709f98187fb6ecd3d3d35fa0b9cd57c Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 20 Jun 2023 09:19:43 +0200 Subject: [PATCH 098/113] Disable scaled_dot_product_attention if torch version < 2 --- modeling/inference_models/exllama/class.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 811f8da1..995f5874 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -282,7 +282,7 @@ class model_backend(InferenceModel): self.generator.settings.top_p = gen_settings.top_p self.generator.settings.min_p = 0.0 - self.generator.gen_begin(gen_in) + self.generator.gen_begin_reuse(gen_in) for i in range(max_new): logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache) @@ -384,10 +384,15 @@ class model_backend(InferenceModel): self.model_config.device_map.lm_head = "cuda:0" self.model_config.device_map.norm = "cuda:0" + # Disable half2 for HIP self.model_config.rmsnorm_no_half2 = bool(torch.version.hip) self.model_config.rope_no_half2 = bool(torch.version.hip) self.model_config.matmul_no_half2 = bool(torch.version.hip) self.model_config.silu_no_half2 = bool(torch.version.hip) + # Disable scaled_dot_product_attention if torch version < 2 + if torch.__version__.startswith("1."): + self.model_config.sdp_thd = 0 + self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] self.path = parameters['path'] if 'path' in parameters else None From adad81639dbe9867039c25874c73d801fb48df86 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 21 Jun 2023 15:47:46 +0200 Subject: [PATCH 099/113] Remove rocm gptq install from environments file --- environments/rocm.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environments/rocm.yml 
b/environments/rocm.yml index b85cfd74..7ef282cc 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -42,6 +42,5 @@ dependencies: - pydub - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - - git+https://github.com/0cc4m/GPTQ-for-LLaMa@0.0.6 - einops - peft==0.3.0 From c753671ac14850a2528c0e1028816a12ca8005ac Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 27 Jun 2023 07:39:37 +0200 Subject: [PATCH 100/113] Add exllama superhot positional embeddings compression support --- modeling/inference_models/exllama/class.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 995f5874..19478cc8 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -106,11 +106,18 @@ class model_backend(InferenceModel): return self.path or os.path.join("models", self.model_name.replace("/", "_")) def _load_config(self, model_name, model_path): + config = False if model_path is not None and os.path.exists(model_path): - return ExLlamaConfig(os.path.join(model_path, "config.json")) - if(os.path.exists("models/{}".format(model_name.replace('/', '_')))): - return ExLlamaConfig(os.path.join("models/{}".format(model_name.replace('/', '_')), "config.json")) - return False + config = ExLlamaConfig(os.path.join(model_path, "config.json")) + if not config and os.path.exists("models/{}".format(model_name.replace('/', '_'))): + config = ExLlamaConfig(os.path.join("models/{}".format(model_name.replace('/', '_')), "config.json")) + + if config and "superhot" in model_name.lower(): + # Set compress_pos_emb factor + config.max_seq_len = 8192 + config.compress_pos_emb = 4.0 + + return config def _load(self, save_model: bool, initial_load: bool) -> None: self.model = self._get_model(self.get_local_model_path(), {}) @@ -277,11 +284,6 @@ class model_backend(InferenceModel): else: gen_in = prompt_tokens - self.generator.settings.temperature = max(gen_settings.temp, 0.01) - self.generator.settings.top_k = gen_settings.top_k if gen_settings.top_k > 0 else 10000 - self.generator.settings.top_p = gen_settings.top_p - self.generator.settings.min_p = 0.0 - self.generator.gen_begin_reuse(gen_in) for i in range(max_new): From 0e4b6571d5f5fb1104fdea7194f2f2913ef243ec Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 28 Jun 2023 22:50:04 +0200 Subject: [PATCH 101/113] Fix non-tuple return from gptq function --- modeling/inference_models/exllama/class.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 19478cc8..1caa2afd 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -44,7 +44,7 @@ def load_model_gptq_settings(path): try: js = json.load(open(path + "/config.json", "r")) except Exception as e: - return False + return False, False gptq_model = False gptq_file = False From ed7ad00b593f431af59a3cd7315dc36ca2940c6f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 15 Jul 2023 22:55:17 +0200 Subject: [PATCH 102/113] Move GPTQ readme changes to separate file --- README.md | 52 -------------------------------------------------- README_GPTQ.md | 50 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 52 deletions(-) create mode 100644 README_GPTQ.md diff --git a/README.md b/README.md index 5f4bc5c7..789b78d1 100644 --- a/README.md +++ b/README.md @@ -1,55 
+1,3 @@ -## This is a fork of KoboldAI that implements 4bit GPTQ quantized support to include Llama. - -### Install/Use Guide -(This guide is for both Linux and Windows and assumes user has git installed and a basic grasp of command line use) - -#### Installation -In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. - -Note: do not run your command prompt as administrator/with elevated priviledges, reports suggest this leads to problems. - -`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules` - -`cd KoboldAI` - -Next step, (Windows) subfolder mode or B: option doesn't matter choose either - -* [if on Windows] - ``` - install_requirements.bat - ``` - * if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. - -* [if on Linux with Nvidia] - ``` - ./install_requirements.sh - ``` -* [if on Linux with AMD] - ``` - ./install_requirements.sh rocm - ./commandline-rocm.sh - pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa@c884b421a233f9603d8224c9b22c2d83dd2c1fc4 - ``` - * If you get error missing hip/hip_runtime_xxx.h you dont have proper rocm & hip pkg installed - * If you get CUDA_HOME envar is not set run in env: - `pip3 install torch --index-url https://download.pytorch.org/whl/rocm5.4.2 --force-reinstall` - -#### Setting up models -If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) - -Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). - -Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) - -So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, it'll load the quantized model. - -#### Running KoboldAI and loading 4bit models -If you haven't done so already, exit the command prompt/leave KAI's conda env. (Close the commandline window on Windows, run `exit` on Linux) - -Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD] - -Switch to UI2, then load your model. - ## KoboldAI - Your gateway to GPT writing This is a browser-based front-end for AI-assisted writing with multiple local & remote AI models. It offers the standard array of tools, including Memory, Author's Note, World Info, Save & Load, adjustable AI settings, formatting options, and the ability to import existing AI Dungeon adventures. You can also turn on Adventure mode and play the game like AI Dungeon Unleashed. diff --git a/README_GPTQ.md b/README_GPTQ.md new file mode 100644 index 00000000..e1961cb8 --- /dev/null +++ b/README_GPTQ.md @@ -0,0 +1,50 @@ +### Install/Use Guide +(This guide is for both Linux and Windows and assumes user has git installed and a basic grasp of command line use) + +#### Installation +In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. 
+ +Note: do not run your command prompt as administrator/with elevated priviledges, reports suggest this leads to problems. + +`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules` + +`cd KoboldAI` + +Next step, (Windows) subfolder mode or B: option doesn't matter choose either + +* [if on Windows] + ``` + install_requirements.bat + ``` + * if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. + +* [if on Linux with Nvidia] + ``` + ./install_requirements.sh + ``` +* [if on Linux with AMD] + ``` + ./install_requirements.sh rocm + ./commandline-rocm.sh + pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa@c884b421a233f9603d8224c9b22c2d83dd2c1fc4 + ``` + * If you get error missing hip/hip_runtime_xxx.h you dont have proper rocm & hip pkg installed + * If you get CUDA_HOME envar is not set run in env: + `pip3 install torch --index-url https://download.pytorch.org/whl/rocm5.4.2 --force-reinstall` + +#### Setting up models +If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) + +Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). + +Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) + +So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, it'll load the quantized model. + +#### Running KoboldAI and loading 4bit models +If you haven't done so already, exit the command prompt/leave KAI's conda env. (Close the commandline window on Windows, run `exit` on Linux) + +Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD] + +Switch to UI2, then load your model. 
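[Editor's note: the file-naming rules above map directly onto how the GPTQ backends locate the checkpoint and infer the groupsize. The sketch below is a simplified, hedged reconstruction based on the `prepare_4bit_load` / `load_model_gptq_settings` helpers in this patch series; the `find_quant_file` name and the example path are illustrative and not part of KoboldAI.]

```
import glob, os, re
from pathlib import Path

def find_quant_file(model_dir):
    # Current naming: quantization metadata lives in config.json, so the
    # weights keep the standard Huggingface file names.
    for name in ("model.safetensors", "model.ckpt"):
        path = os.path.join(model_dir, name)
        if os.path.isfile(path):
            return path, -1  # groupsize is read from config.json instead

    # Legacy naming from this guide: 4bit.safetensors, 4bit-128g.pt, ...
    for pattern in ("4bit*.safetensors", "4bit*.pt"):
        for candidate in glob.glob(os.path.join(model_dir, pattern)):
            fname = Path(candidate).parts[-1]
            groups = re.findall(r"^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
            return candidate, int(groups[0]) if groups else -1

    return None, -1

# e.g. find_quant_file("models/my-llama-4bit") might return
# ("models/my-llama-4bit/4bit-128g.safetensors", 128)
```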
+ From 9aa6c5fbbfcb9a2f22f38fc9baa07e5baa033361 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 19 Jul 2023 06:56:09 +0200 Subject: [PATCH 103/113] Merge upstream changes, fix conflict, adapt backends to changes --- modeling/inference_models/exllama/class.py | 1 + .../inference_models/gptq_hf_torch/class.py | 50 ++++++------------- 2 files changed, 16 insertions(+), 35 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 1caa2afd..21eba58e 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -32,6 +32,7 @@ from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig from transformers import LlamaTokenizer from exllama.generator import ExLlamaGenerator +model_backend_type = "GPTQ" model_backend_name = "ExLlama" # When set to true, messages will appear in the console if samplers are not diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index d07aef23..16d3db91 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -18,13 +18,6 @@ import modeling.lazy_loader as lazy_loader import koboldai_settings from logger import logger, set_logger_verbosity -try: - import breakmodel -except ModuleNotFoundError as e: - # Breakmodel is only expected to work on GPU - if not utils.koboldai_vars.use_colab_tpu: - raise e - from modeling.inference_models.hf_torch import HFTorchInferenceModel from modeling.tokenizer import GenericTokenizer @@ -47,6 +40,7 @@ except ImportError: autogptq_support = False +model_backend_type = "GPTQ" model_backend_name = "Huggingface GPTQ" @@ -112,7 +106,7 @@ def get_gptq_version(fpath): class model_backend(HFTorchInferenceModel): def is_valid(self, model_name, model_path, menu_path): gptq_model, _, _, _, _ = load_model_gptq_settings(model_path) - return gptq_model + return bool(gptq_model) def _load(self, save_model: bool, initial_load: bool) -> None: # Make model path the same as the model name to make this consistent @@ -126,7 +120,7 @@ class model_backend(HFTorchInferenceModel): self.lazy_load = False - gpulayers = breakmodel.gpu_blocks + gpulayers = self.breakmodel_config.gpu_blocks try: self.gpu_layers_list = [int(l) for l in gpulayers.split(",")] @@ -149,42 +143,28 @@ class model_backend(HFTorchInferenceModel): self.breakmodel_device_config(self.model_config) if self.lazy_load: + # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time + tf_kwargs.pop("low_cpu_mem_usage", None) + # If we're using lazy loader, we need to figure out what the model's hidden layers are called - with lazy_loader.use_lazy_load( - dematerialized_modules=True, use_accelerate_init_empty_weights=True - ): + with lazy_loader.use_lazy_load(dematerialized_modules=True): try: metamodel = AutoModelForCausalLM.from_config(self.model_config) utils.layers_module_names = utils.get_layers_module_names(metamodel) utils.module_names = list(metamodel.state_dict().keys()) utils.named_buffers = list(metamodel.named_buffers(recurse=True)) except Exception as e: + if utils.args.panic: + raise e logger.warning(f"Gave up on lazy loading due to {e}") self.lazy_load = False - # Download model from Huggingface if it does not exist, otherwise load locally - with self._maybe_use_float16(), lazy_loader.use_lazy_load( - enable=self.lazy_load, - callback=self._get_lazy_load_callback(utils.num_layers(self.model_config)) - if self.lazy_load - else None, - 
dematerialized_modules=True, - ): - if self.lazy_load: - # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - tf_kwargs.pop("low_cpu_mem_usage", None) - - if self.get_local_model_path(): - # Model is stored locally, load it. - self.model = self._get_model(self.get_local_model_path(), tf_kwargs) - self.tokenizer = self._get_tokenizer(self.get_local_model_path()) - else: - raise NotImplementedError("GPTQ Model downloading not implemented") - - if not self.lazy_load: - utils.layers_module_names = utils.get_layers_module_names(self.model) - utils.module_names = list(self.model.state_dict().keys()) - utils.named_buffers = list(self.model.named_buffers(recurse=True)) + if self.get_local_model_path(): + # Model is stored locally, load it. + self.model = self._get_model(self.get_local_model_path(), tf_kwargs) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) + else: + raise NotImplementedError("GPTQ Model downloading not implemented") if ( utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default From c84d063be880672a30ecd1a8a48791b6ab12685e Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 19 Jul 2023 07:01:11 +0200 Subject: [PATCH 104/113] Revert settings changes --- koboldai_settings.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/koboldai_settings.py b/koboldai_settings.py index 8ab134fa..ebd8c019 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -919,7 +919,7 @@ class story_settings(settings): # In percent!!! self.commentary_chance = 0 self.commentary_enabled = False - + self.save_paths = SavePaths(os.path.join("stories", self.story_name or "Untitled")) ################### must be at bottom ######################### @@ -1206,12 +1206,12 @@ class system_settings(settings): local_only_variables = ['lua_state', 'lua_logname', 'lua_koboldbridge', 'lua_kobold', 'lua_koboldcore', 'regex_sl', 'acregex_ai', 'acregex_ui', 'comregex_ai', 'comregex_ui', 'sp', '_horde_pid', 'inference_config', 'image_pipeline', - 'summarizer', 'summary_tokenizer', 'tts_model', 'rng_states', 'comregex_ai', 'comregex_ui'] + 'summarizer', 'summary_tokenizer', 'tts_model', 'rng_states', 'comregex_ai', 'comregex_ui', 'colab_arg'] no_save_variables = ['lua_state', 'lua_logname', 'lua_koboldbridge', 'lua_kobold', 'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy', 'serverstarted', 'inference_config', 'image_pipeline', 'summarizer', 'on_colab' 'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model', - 'generating_image', 'bit_8_available', 'bit_4_available', 'host', 'hascuda', 'usegpu', 'rng_states', 'comregex_ai', 'comregex_ui', 'git_repository', 'git_branch'] + 'generating_image', 'bit_8_available', 'host', 'hascuda', 'usegpu', 'rng_states', 'comregex_ai', 'comregex_ui', 'git_repository', 'git_branch', 'colab_arg'] settings_name = "system" def __init__(self, socketio, koboldai_var): self._socketio = socketio @@ -1279,11 +1279,12 @@ class system_settings(settings): self.disable_output_formatting = False self.api_tokenizer_id = None self.port = 5000 + self.colab_arg = False try: import google.colab self.on_colab = True except: - self.on_colab = False + self.on_colab = self.colab_arg print(f"Colab Check: {self.on_colab}, TPU: {self.use_colab_tpu}") self.horde_share = False self._horde_pid = None @@ -1294,13 +1295,6 @@ class system_settings(settings): self.keep_img_gen_in_memory = False self.cookies = {} #cookies for colab since colab's URL changes, cookies 
are lost self.experimental_features = False - # Check if repos/gptq exists for 4-bit mode - self.bit_4_available = True - try: - import gptq - except ImportError: - self.bit_4_available = False - self.seen_messages = [] self.git_repository = "" self.git_branch = "" From 1c5da2bbf3b1dbe71599449e7953df3fc06ab301 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 19 Jul 2023 07:08:39 +0200 Subject: [PATCH 105/113] Move pip docs from KoboldAI into GPTQ repo --- docs/gptq-whl-links.html | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 docs/gptq-whl-links.html diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html deleted file mode 100644 index b993d9bd..00000000 --- a/docs/gptq-whl-links.html +++ /dev/null @@ -1,19 +0,0 @@ -gptq_koboldai-0.0.1-cp38-cp38-linux_x86_64.whl -gptq_koboldai_rocm-0.0.1-cp38-cp38-linux_x86_64.whl -gptq_koboldai-0.0.1-cp38-cp38-win_amd64.whl - -gptq_koboldai-0.0.2-cp38-cp38-linux_x86_64.whl -gptq_koboldai_rocm-0.0.2-cp38-cp38-linux_x86_64.whl -gptq_koboldai-0.0.2-cp38-cp38-win_amd64.whl - -gptq_koboldai-0.0.3-cp38-cp38-linux_x86_64.whl -gptq_koboldai-0.0.3-cp38-cp38-win_amd64.whl - -gptq_koboldai-0.0.4-cp38-cp38-linux_x86_64.whl -gptq_koboldai-0.0.4-cp38-cp38-win_amd64.whl - -gptq_koboldai-0.0.5-cp38-cp38-linux_x86_64.whl -gptq_koboldai-0.0.5-cp38-cp38-win_amd64.whl - -gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl -gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl From 19f511dc9f11c2f68f1c697f8d6b4f0521335c54 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 19 Jul 2023 07:12:37 +0200 Subject: [PATCH 106/113] Load GPTQ module from GPTQ repo docs --- environments/huggingface.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index c4cccf98..2cbefe7f 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -47,7 +47,7 @@ dependencies: - pydub - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html + - --find-links=https://0cc4m.github.io/GPTQ-for-LLaMa/gptq-whl-links.html - gptq_koboldai==0.0.6 - einops - peft==0.3.0 From 58908ab846f44671533a66fb866bedbc45a60198 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 19 Jul 2023 07:14:03 +0200 Subject: [PATCH 107/113] Revert aiserver.py changes --- aiserver.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/aiserver.py b/aiserver.py index 88a76454..0aa9bd4c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -50,8 +50,6 @@ import multiprocessing import numpy as np from collections import OrderedDict from typing import Any, Callable, TypeVar, Tuple, Union, Dict, Set, List, Optional, Type -import glob -from pathlib import Path import requests import html @@ -1087,6 +1085,8 @@ def getmodelname(): def get_hidden_size_from_model(model): return model.get_input_embeddings().embedding_dim + + #==================================================================# # Allow the models to override some settings #==================================================================# @@ -1162,7 +1162,6 @@ def loadmodelsettings(): if(not koboldai_vars.gamestarted): koboldai_vars.authornotetemplate = koboldai_vars.setauthornotetemplate - #==================================================================# # Take settings from koboldai_vars and write them to client settings file #==================================================================# @@ -1594,7 +1593,8 @@ def general_startup(override_args=None): if 
koboldai_vars.use_colab_tpu and args.model_backend == "Huggingface": args.model_backend = "Huggingface MTJ" - + + if args.model: # At this point we have to try to load the model through the selected backend if args.model_backend not in model_backends: @@ -1761,7 +1761,8 @@ def load_model(model_backend, initial_load=False): if 'model' in globals(): model.unload() - + + # If transformers model was selected & GPU available, ask to use CPU or GPU if(not koboldai_vars.use_colab_tpu and koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): # loadmodelsettings() @@ -1783,6 +1784,8 @@ def load_model(model_backend, initial_load=False): else: koboldai_vars.default_preset = koboldai_settings.default_preset + + with use_custom_unpickler(RestrictedUnpickler): model = model_backends[model_backend] model.load(initial_load=initial_load, save_model=not (args.colab or args.cacheonly) or args.savemodel) @@ -1791,7 +1794,7 @@ def load_model(model_backend, initial_load=False): koboldai_vars.model = os.path.basename(os.path.normpath(model.path)) logger.info(koboldai_vars.model) logger.debug("Model Type: {}".format(koboldai_vars.model_type)) - + # TODO: Convert everywhere to use model.tokenizer if model: tokenizer = model.tokenizer From 748e5ef318095d2d6f47ed6da3272699c96088af Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 23 Jul 2023 07:11:28 +0200 Subject: [PATCH 108/113] Add sliders for exllama context size and related methods --- modeling/inference_models/exllama/class.py | 58 ++++++++++++++++--- .../inference_models/gptq_hf_torch/class.py | 4 +- 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 21eba58e..aa37a7aa 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -50,7 +50,7 @@ def load_model_gptq_settings(path): gptq_model = False gptq_file = False - gptq_legacy_files = glob.glob(os.path.join(path, "4bit*.safetensors")) + gptq_legacy_files = glob.glob(os.path.join(path, "*4bit*.safetensors")) if "gptq_bits" in js: gptq_model = True gptq_file = os.path.join(path, "model.safetensors") @@ -58,7 +58,7 @@ def load_model_gptq_settings(path): gptq_model = True gptq_file = gptq_legacy_files[0] fname = Path(gptq_file).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + g = re.findall("(?:4bit)(?:-)(\\d+)(?:g-?)", fname) return gptq_model, gptq_file @@ -113,11 +113,6 @@ class model_backend(InferenceModel): if not config and os.path.exists("models/{}".format(model_name.replace('/', '_'))): config = ExLlamaConfig(os.path.join("models/{}".format(model_name.replace('/', '_')), "config.json")) - if config and "superhot" in model_name.lower(): - # Set compress_pos_emb factor - config.max_seq_len = 8192 - config.compress_pos_emb = 4.0 - return config def _load(self, save_model: bool, initial_load: bool) -> None: @@ -366,6 +361,51 @@ class model_backend(InferenceModel): "refresh_model_inputs": False }) + requested_parameters.append({ + "uitype": "slider", + "unit": "int", + "label": "Maximum Context", + "id": "max_ctx", + "min": 2048, + "max": 16384, + "step": 512, + "default": 2048, + "tooltip": "The maximum context size the model supports", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + requested_parameters.append({ + "uitype": "slider", + "unit": "float", + "label": "Embedding Compression", + 
"id": "compress_emb", + "min": 1, + "max": 8, + "step": 0.25, + "default": 1, + "tooltip": "If the model requires compressed embeddings, set them here", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + requested_parameters.append({ + "uitype": "slider", + "unit": "float", + "label": "NTK alpha", + "id": "ntk_alpha", + "min": 1, + "max": 32, + "step": 0.25, + "default": 1, + "tooltip": "NTK alpha value", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + return requested_parameters def set_input_parameters(self, parameters): @@ -387,6 +427,10 @@ class model_backend(InferenceModel): self.model_config.device_map.lm_head = "cuda:0" self.model_config.device_map.norm = "cuda:0" + self.model_config.max_seq_len = parameters["max_ctx"] + self.model_config.compress_pos_emb = parameters["compress_emb"] + self.model_config.alpha_value = parameters["ntk_alpha"] + # Disable half2 for HIP self.model_config.rmsnorm_no_half2 = bool(torch.version.hip) self.model_config.rope_no_half2 = bool(torch.version.hip) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 16d3db91..157ebdbe 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -56,7 +56,7 @@ def load_model_gptq_settings(path): gptq_file = False gptq_version = -1 - gptq_legacy_files = glob.glob(os.path.join(path, "4bit*.pt")) + glob.glob(os.path.join(path, "4bit*.safetensors")) + gptq_legacy_files = glob.glob(os.path.join(path, "*4bit*.pt")) + glob.glob(os.path.join(path, "*4bit*.safetensors")) if "gptq_bits" in js: gptq_model = True gptq_bits = js["gptq_bits"] @@ -70,7 +70,7 @@ def load_model_gptq_settings(path): gptq_bits = 4 gptq_file = gptq_legacy_files[0] fname = Path(gptq_file).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + g = re.findall("(?:4bit)(?:-)(\\d+)(?:g-?)", fname) gptq_groupsize = int(g[0]) if g else -1 gptq_version = -1 From 09bb1021ddc548e4422d6426fe2c1867b6d152b8 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 23 Jul 2023 07:14:23 +0200 Subject: [PATCH 109/113] Fallback to transformers if hf_bleeding_edge not available --- modeling/inference_models/generic_hf_torch/class.py | 5 ++++- modeling/inference_models/gptq_hf_torch/class.py | 7 +++++-- modeling/inference_models/hf.py | 5 ++++- modeling/inference_models/hf_torch.py | 5 ++++- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index de89034b..5471ae43 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -7,7 +7,10 @@ import shutil from typing import Union from transformers import GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig -from hf_bleeding_edge import AutoModelForCausalLM +try: + from hf_bleeding_edge import AutoModelForCausalLM +except ImportError: + from transformers import AutoModelForCausalLM from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 157ebdbe..0819c8ae 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ 
b/modeling/inference_models/gptq_hf_torch/class.py @@ -10,8 +10,11 @@ import sys from typing import Union from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer -import hf_bleeding_edge -from hf_bleeding_edge import AutoModelForCausalLM +try: + import hf_bleeding_edge + from hf_bleeding_edge import AutoModelForCausalLM +except ImportError: + from transformers import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index cd55c3ef..be0fb059 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -1,6 +1,9 @@ import os, sys from typing import Optional -from hf_bleeding_edge import AutoConfig +try: + from hf_bleeding_edge import AutoConfig +except ImportError: + from transformers import AutoConfig import warnings import utils diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index f7bd7a0b..6372858f 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -19,7 +19,10 @@ from transformers import ( GPT2LMHeadModel, LogitsProcessorList, ) -from hf_bleeding_edge import AutoModelForCausalLM +try: + from hf_bleeding_edge import AutoModelForCausalLM +except ImportError: + from transformers import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader From 31a984aa3d3c37e44b6114d3d5196167940181ee Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 23 Jul 2023 07:33:51 +0200 Subject: [PATCH 110/113] Automatically install exllama module --- environments/huggingface.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 82ea8f9b..e97f3e2e 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -52,3 +52,5 @@ dependencies: - einops - peft==0.3.0 - scipy + - --find-links=https://0cc4m.github.io/exllama/exllama-whl-links.html + - exllama==0.0.6 From 49740aa5abf406f7b9f6a60e60e23815c3f7007f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 23 Jul 2023 21:56:48 +0200 Subject: [PATCH 111/113] Fix ntk alpha --- modeling/inference_models/exllama/class.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index aa37a7aa..e3c7a874 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -430,6 +430,7 @@ class model_backend(InferenceModel): self.model_config.max_seq_len = parameters["max_ctx"] self.model_config.compress_pos_emb = parameters["compress_emb"] self.model_config.alpha_value = parameters["ntk_alpha"] + self.model_config.calculate_rotary_embedding_base() # Disable half2 for HIP self.model_config.rmsnorm_no_half2 = bool(torch.version.hip) From 973aea12ea079e9c5de1e418b848a0407da7eab7 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 23 Jul 2023 22:07:34 +0200 Subject: [PATCH 112/113] Only import big python modules for GPTQ once they get used --- .../inference_models/gptq_hf_torch/class.py | 50 +++++++++---------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 0819c8ae..81a33c70 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -9,13 +9,6 @@ import shutil import sys from typing import Union -from transformers import GPTNeoForCausalLM, 
AutoTokenizer, LlamaTokenizer -try: - import hf_bleeding_edge - from hf_bleeding_edge import AutoModelForCausalLM -except ImportError: - from transformers import AutoModelForCausalLM - import utils import modeling.lazy_loader as lazy_loader import koboldai_settings @@ -24,23 +17,7 @@ from logger import logger, set_logger_verbosity from modeling.inference_models.hf_torch import HFTorchInferenceModel from modeling.tokenizer import GenericTokenizer -# 4-bit dependencies -import gptq from pathlib import Path -from gptq.gptj import load_quant as gptj_load_quant -from gptq.gptneox import load_quant as gptneox_load_quant -from gptq.llama import load_quant as llama_load_quant -from gptq.opt import load_quant as opt_load_quant -from gptq.bigcode import load_quant as bigcode_load_quant -from gptq.mpt import load_quant as mpt_load_quant -from gptq.offload import load_quant_offload - -autogptq_support = True -try: - import auto_gptq - from auto_gptq import AutoGPTQForCausalLM -except ImportError: - autogptq_support = False model_backend_type = "GPTQ" @@ -185,6 +162,15 @@ class model_backend(HFTorchInferenceModel): utils.koboldai_vars.modeldim = self.get_hidden_size() def _get_model(self, location: str, tf_kwargs: Dict): + import gptq + from gptq.gptj import load_quant as gptj_load_quant + from gptq.gptneox import load_quant as gptneox_load_quant + from gptq.llama import load_quant as llama_load_quant + from gptq.opt import load_quant as opt_load_quant + from gptq.bigcode import load_quant as bigcode_load_quant + from gptq.mpt import load_quant as mpt_load_quant + from gptq.offload import load_quant_offload + gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location) v2_bias = False @@ -207,7 +193,19 @@ class model_backend(HFTorchInferenceModel): model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "gpt_bigcode": model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half() - elif autogptq_support: + else: + try: + import auto_gptq + from auto_gptq import AutoGPTQForCausalLM + except ImportError: + raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit") + + try: + import hf_bleeding_edge + from hf_bleeding_edge import AutoModelForCausalLM + except ImportError: + from transformers import AutoModelForCausalLM + # Monkey patch in hf_bleeding_edge to avoid having to trust remote code auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig @@ -227,12 +225,12 @@ class model_backend(HFTorchInferenceModel): return self.model.generate(*args, **kwargs) type(model).generate = generate - else: - raise RuntimeError(f"4-bit load failed. 
Model type {model_type} not supported in 4-bit") return model def _get_tokenizer(self, location: str): + from transformers import AutoTokenizer, LlamaTokenizer + model_type = self.get_model_type() if model_type == "llama": tokenizer = LlamaTokenizer.from_pretrained(location) From 73953068c0e0752094843e17151471056aa132f2 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 23 Jul 2023 22:12:31 +0200 Subject: [PATCH 113/113] Remove exllama backend, pending further fixes --- modeling/inference_models/exllama/class.py | 446 --------------------- 1 file changed, 446 deletions(-) delete mode 100644 modeling/inference_models/exllama/class.py diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py deleted file mode 100644 index e3c7a874..00000000 --- a/modeling/inference_models/exllama/class.py +++ /dev/null @@ -1,446 +0,0 @@ -from __future__ import annotations - -import time, json -import torch -import requests -import numpy as np -from typing import List, Optional, Union -import os -import glob -from pathlib import Path -import re -import warnings -import gc - -import utils -from logger import logger - -from modeling import warpers -from modeling.warpers import Warper -from modeling.stoppers import Stoppers -from modeling.post_token_hooks import PostTokenHooks -from modeling.inference_model import ( - GenerationResult, - GenerationSettings, - InferenceModel, - ModelCapabilities, -) - -from modeling.tokenizer import GenericTokenizer - -from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig -from transformers import LlamaTokenizer -from exllama.generator import ExLlamaGenerator - -model_backend_type = "GPTQ" -model_backend_name = "ExLlama" - -# When set to true, messages will appear in the console if samplers are not -# changing the scores. Keep in mind some samplers don't always change the -# scores for each token. 
-LOG_SAMPLER_NO_EFFECT = False - - -def load_model_gptq_settings(path): - try: - js = json.load(open(path + "/config.json", "r")) - except Exception as e: - return False, False - - gptq_model = False - gptq_file = False - - gptq_legacy_files = glob.glob(os.path.join(path, "*4bit*.safetensors")) - if "gptq_bits" in js: - gptq_model = True - gptq_file = os.path.join(path, "model.safetensors") - elif gptq_legacy_files: - gptq_model = True - gptq_file = gptq_legacy_files[0] - fname = Path(gptq_file).parts[-1] - g = re.findall("(?:4bit)(?:-)(\\d+)(?:g-?)", fname) - - return gptq_model, gptq_file - - -class model_backend(InferenceModel): - def __init__(self) -> None: - super().__init__() - self.model_config = None - - self.model = None - self.tokenizer = None - self.cache = None - self.generator = None - - self.model_name = "" - self.path = None - - self.post_token_hooks = [ - PostTokenHooks.stream_tokens, - ] - - self.stopper_hooks = [ - Stoppers.core_stopper, - Stoppers.dynamic_wi_scanner, - Stoppers.singleline_stopper, - Stoppers.chat_mode_stopper, - Stoppers.stop_sequence_stopper, - ] - - self.capabilties = ModelCapabilities( - embedding_manipulation=False, - post_token_hooks=True, - stopper_hooks=False, - post_token_probs=False, - ) - - def is_valid(self, model_name, model_path, menu_path): - gptq_model, _ = load_model_gptq_settings(model_path) - try: - self.model_config = self._load_config(model_name, model_path) - return self.model_config and gptq_model - except: - return False - - def get_local_model_path(self): - return self.path or os.path.join("models", self.model_name.replace("/", "_")) - - def _load_config(self, model_name, model_path): - config = False - if model_path is not None and os.path.exists(model_path): - config = ExLlamaConfig(os.path.join(model_path, "config.json")) - if not config and os.path.exists("models/{}".format(model_name.replace('/', '_'))): - config = ExLlamaConfig(os.path.join("models/{}".format(model_name.replace('/', '_')), "config.json")) - - return config - - def _load(self, save_model: bool, initial_load: bool) -> None: - self.model = self._get_model(self.get_local_model_path(), {}) - self.tokenizer = self._get_tokenizer(self.get_local_model_path()) - - self.cache = ExLlamaCache(self.model) - - self.generator = ExLlamaGenerator(self.model, self.tokenizer.tokenizer, self.cache) - - def _post_load(self) -> None: - # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer - self.tokenizer.add_bos_token = False - - # HF transformers no longer supports decode_with_prefix_space - # We work around this by wrapping decode, encode, and __call__ - # with versions that work around the 'prefix space' misfeature - # of sentencepiece. - vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size)) - has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")} - - # Wrap 'decode' with a method that always returns text starting with a space - # when the head token starts with a space. This is what 'decode_with_prefix_space' - # used to do, and we implement it using the same technique (building a cache of - # tokens that should have a prefix space, and then prepending a space if the first - # token is in this set.) We also work around a bizarre behavior in which decoding - # a single token 13 behaves differently than decoding a squence containing only [13]. 
- original_decode = type(self.tokenizer.tokenizer).decode - def decode_wrapper(self, token_ids, *args, **kwargs): - first = None - # Note, the code below that wraps single-value token_ids in a list - # is to work around this wonky behavior: - # >>> t.decode(13) - # '<0x0A>' - # >>> t.decode([13]) - # '\n' - # Not doing this causes token streaming to receive <0x0A> characters - # instead of newlines. - if isinstance(token_ids, int): - first = token_ids - token_ids = [first] - elif hasattr(token_ids, 'dim'): # Check for e.g. torch.Tensor - # Tensors don't support the Python standard of 'empty is False' - # and the special case of dimension 0 tensors also needs to be - # handled separately. - if token_ids.dim() == 0: - first = int(token_ids.item()) - token_ids = [first] - elif len(token_ids) > 0: - first = int(token_ids[0]) - elif token_ids is not None and len(token_ids) > 0: - first = token_ids[0] - result = original_decode(self, token_ids, *args, **kwargs) - if first is not None and first in has_prefix_space: - result = " " + result - return result - # GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it - object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer)) - - # Wrap encode and __call__ to work around the 'prefix space' misfeature also. - # The problem is that "Bob" at the start of text is encoded as if it is - # " Bob". This creates a problem because it means you can't split text, encode - # the pieces, concatenate the tokens, decode them, and get the original text back. - # The workaround is to prepend a known token that (1) starts with a space; and - # (2) is not the prefix of any other token. After searching through the vocab - # " ," (space comma) is the only token containing only printable ascii characters - # that fits this bill. By prepending ',' to the text, the original encode - # method always returns [1919, ...], where the tail of the sequence is the - # actual encoded result we want without the prefix space behavior. - original_encode = type(self.tokenizer.tokenizer).encode - def encode_wrapper(self, text, *args, **kwargs): - if type(text) is str: - text = ',' + text - result = original_encode(self, text, *args, **kwargs) - result = result[1:] - else: - result = original_encode(self, text, *args, **kwargs) - return result - object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer)) - - # Since 'encode' is documented as being deprecated, also override __call__. - # This doesn't appear to currently be used by KoboldAI, but doing so - # in case someone uses it in the future. 
- original_call = type(self.tokenizer.tokenizer).__call__ - def call_wrapper(self, text, *args, **kwargs): - if type(text) is str: - text = ',' + text - result = original_call(self, text, *args, **kwargs) - result = result[1:] - else: - result = original_call(self, text, *args, **kwargs) - return result - object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer)) - - def unload(self): - self.model_config = None - - self.model = None - self.tokenizer = None - self.cache = None - self.generator = None - - self.model_name = "" - self.path = None - - with torch.no_grad(): - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated") - for tensor in gc.get_objects(): - try: - if torch.is_tensor(tensor): - tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype)) - except: - pass - gc.collect() - try: - with torch.no_grad(): - torch.cuda.empty_cache() - except: - pass - - def _apply_warpers( - self, scores: torch.Tensor, input_ids: torch.Tensor - ) -> torch.Tensor: - warpers.update_settings() - - if LOG_SAMPLER_NO_EFFECT: - pre = torch.Tensor(scores) - - for sid in utils.koboldai_vars.sampler_order: - warper = Warper.from_id(sid) - - if not warper.value_is_valid(): - continue - - if warper == warpers.RepetitionPenalty: - # Rep pen needs more data than other samplers - scores = warper.torch(scores, input_ids=input_ids) - else: - scores = warper.torch(scores) - - assert scores is not None, f"Scores are None; warper '{warper}' is to blame" - - if LOG_SAMPLER_NO_EFFECT: - if torch.equal(pre, scores): - logger.info(warper, "had no effect on the scores.") - pre = torch.Tensor(scores) - return scores - - def _raw_generate( - self, - prompt_tokens: Union[List[int], torch.Tensor], - max_new: int, - gen_settings: GenerationSettings, - single_line: bool = False, - batch_count: int = 1, - seed: Optional[int] = None, - **kwargs, - ) -> GenerationResult: - if seed: - torch.manual_seed(seed) - - if not isinstance(prompt_tokens, torch.Tensor): - gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] - else: - gen_in = prompt_tokens - - self.generator.gen_begin_reuse(gen_in) - - for i in range(max_new): - logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache) - logits[:, :, self.tokenizer.bos_token_id] = -10000.0 - - logits = torch.unsqueeze(logits[0, -1, :], 0) - - scores = self._apply_warpers(logits, gen_in) - - scores = torch.softmax(scores, dim=-1) - - token = torch.multinomial(scores, 1) - - self.generator.gen_accept_token(token) - - self._post_token_gen(self.generator.sequence) - - utils.koboldai_vars.generated_tkns += 1 - - if token.item() == self.tokenizer.eos_token_id: break - - utils.koboldai_vars.generated_tkns = max_new - - return GenerationResult( - model=self, - out_batches=np.array( - self.generator.sequence[:, gen_in.size(1):], - ), - prompt=prompt_tokens, - is_whole_generation=True, - single_line=single_line, - ) - - def _get_model(self, location: str, tf_kwargs: Dict): - if not self.model_config: - ExLlamaConfig(os.path.join(location, "config.json")) - - _, self.model_config.model_path = load_model_gptq_settings(location) - # self.model_config.gpu_peer_fix = True - return ExLlama(self.model_config) - - def _get_tokenizer(self, location: str): - tokenizer = GenericTokenizer(LlamaTokenizer.from_pretrained(location)) - tokenizer._koboldai_header = tokenizer.encode("") - return tokenizer - - def get_requested_parameters(self, model_name, model_path, menu_path, 
parameters = {}): - requested_parameters = [] - gpu_count = torch.cuda.device_count() - layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None - requested_parameters.append({ - "uitype": "Valid Display", - "unit": "text", - "label": "Current Allocated Layers: %1/{}".format(layer_count), #%1 will be the validation value - "id": "valid_layers", - "max": layer_count, - "step": 1, - "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="}, - "menu_path": "Layers", - "extra_classes": "", - "refresh_model_inputs": False - }) - for i in range(gpu_count): - requested_parameters.append({ - "uitype": "slider", - "unit": "int", - "label": "{} Layers".format(torch.cuda.get_device_name(i)), - "id": "{}_Layers".format(i), - "min": 0, - "max": layer_count, - "step": 1, - "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="}, - "check_message": "The sum of assigned layers must equal {}".format(layer_count), - "default": [layer_count if i == 0 else 0], - "tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)), - "menu_path": "Layers", - "extra_classes": "", - "refresh_model_inputs": False - }) - - requested_parameters.append({ - "uitype": "slider", - "unit": "int", - "label": "Maximum Context", - "id": "max_ctx", - "min": 2048, - "max": 16384, - "step": 512, - "default": 2048, - "tooltip": "The maximum context size the model supports", - "menu_path": "Configuration", - "extra_classes": "", - "refresh_model_inputs": False - }) - - requested_parameters.append({ - "uitype": "slider", - "unit": "float", - "label": "Embedding Compression", - "id": "compress_emb", - "min": 1, - "max": 8, - "step": 0.25, - "default": 1, - "tooltip": "If the model requires compressed embeddings, set them here", - "menu_path": "Configuration", - "extra_classes": "", - "refresh_model_inputs": False - }) - - requested_parameters.append({ - "uitype": "slider", - "unit": "float", - "label": "NTK alpha", - "id": "ntk_alpha", - "min": 1, - "max": 32, - "step": 0.25, - "default": 1, - "tooltip": "NTK alpha value", - "menu_path": "Configuration", - "extra_classes": "", - "refresh_model_inputs": False - }) - - return requested_parameters - - def set_input_parameters(self, parameters): - gpu_count = torch.cuda.device_count() - layers = [] - for i in range(gpu_count): - if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric(): - layers.append(int(parameters["{}_Layers".format(i)])) - elif isinstance(parameters["{}_Layers".format(i)], str): - layers.append(None) - else: - layers.append(parameters["{}_Layers".format(i)]) - - self.layers = layers - self.model_config.device_map.layers = [] - for i, l in enumerate(layers): - if l > 0: - self.model_config.device_map.layers.extend([f"cuda:{i}"] * l) - self.model_config.device_map.lm_head = "cuda:0" - self.model_config.device_map.norm = "cuda:0" - - self.model_config.max_seq_len = parameters["max_ctx"] - self.model_config.compress_pos_emb = parameters["compress_emb"] - self.model_config.alpha_value = parameters["ntk_alpha"] - self.model_config.calculate_rotary_embedding_base() - - # Disable half2 for HIP - self.model_config.rmsnorm_no_half2 = 
bool(torch.version.hip) - self.model_config.rope_no_half2 = bool(torch.version.hip) - self.model_config.matmul_no_half2 = bool(torch.version.hip) - self.model_config.silu_no_half2 = bool(torch.version.hip) - - # Disable scaled_dot_product_attention if torch version < 2 - if torch.__version__.startswith("1."): - self.model_config.sdp_thd = 0 - - self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] - self.path = parameters['path'] if 'path' in parameters else None
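
A note on the `ntk_alpha` parameter wired up in the patches above: NTK-aware scaling extends the usable context by enlarging the RoPE frequency base rather than compressing position indices, which is what `calculate_rotary_embedding_base()` derives from the slider value. The sketch below shows the commonly used formula; it is an approximation for illustration, and the exact expression inside exllama may differ.

```python
def ntk_scaled_rope_base(alpha: float, head_dim: int, base: float = 10000.0) -> float:
    """Return the enlarged RoPE frequency base for a given NTK alpha.

    alpha = 1.0 leaves the base unchanged; larger values are used when
    running a model beyond its trained context length.
    """
    return base * alpha ** (head_dim / (head_dim - 2))

# With 128-dimensional attention heads, alpha = 2.0 raises the base
# from 10000 to roughly 20221.
print(ntk_scaled_rope_base(alpha=2.0, head_dim=128))
```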