From 8c9ed5540626655870b6c8e79b5a838f6f012a91 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Thu, 9 Mar 2023 22:36:45 -0500 Subject: [PATCH 01/47] Update aiserver.py --- aiserver.py | 63 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/aiserver.py b/aiserver.py index 7c60b04e..4174d1fa 100644 --- a/aiserver.py +++ b/aiserver.py @@ -87,6 +87,38 @@ from io import BytesIO global tpu_mtj_backend +from transformers.models.llama.tokenization_llama import LLaMATokenizer +from repos.gptq.gptq import * +from repos.gptq.modelutils import * +from repos.gptq.quant import * +def load_quant(model, checkpoint, wbits): + from transformers import LLaMAConfig, LLaMAForCausalLM + config = LLaMAConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = LLaMAForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + make_quant(model, layers, wbits) + + print('Loading model ...') + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done.') + + return model + if lupa.LUA_VERSION[:2] != (5, 4): logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.") @@ -2886,7 +2918,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal @functools.lru_cache(maxsize=None) def get_original_key(key): - return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + try: + return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + except ValueError: + return key for key, value in model_dict.items(): original_key = get_original_key(key) @@ -3083,22 +3118,24 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): + tokenizer = LLaMATokenizer.from_pretrained(koboldai_vars.custmodpth) + # try: + # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) + # except Exception as e: + # try: + # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") + # except Exception as e: + # try: + # tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") + # except Exception as e: + # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) - except Exception as e: - try: - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - try: - tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - tokenizer = GPT2Tokenizer.from_pretrained("gpt2", 
revision=koboldai_vars.revision, cache_dir="cache") - try: - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_30B_4BIT'], 4) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From dcf9d37a00dc582618f10deef6d226f77018dc16 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Fri, 10 Mar 2023 00:01:40 -0500 Subject: [PATCH 02/47] It just works. --- aiserver.py | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/aiserver.py b/aiserver.py index 4174d1fa..66aa7362 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1142,9 +1142,9 @@ def move_model_to_devices(model): if(not utils.HAS_ACCELERATE and not koboldai_vars.breakmodel): if(koboldai_vars.usegpu): - model = model.half().to(koboldai_vars.gpu_device) + model = model.to(koboldai_vars.gpu_device) else: - model = model.to('cpu').float() + model = model.to('cpu') generator = model.generate return @@ -1172,7 +1172,6 @@ def move_model_to_devices(model): generator = model.generate return - model.half() gc.collect() if(hasattr(model, "transformer")): @@ -2983,10 +2982,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
", end="", flush=True) model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - if model_dict[key].dtype is torch.float32: - koboldai_vars.fp32_model = True - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - model_dict[key] = model_dict[key].to(torch.float16) + # if model_dict[key].dtype is torch.float32: + # koboldai_vars.fp32_model = True + # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: + # model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -3010,16 +3009,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if utils.offload_index: for name, tensor in utils.named_buffers: dtype = tensor.dtype - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): - dtype = torch.float32 - if name in model_dict and model_dict[name].dtype is not dtype: - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is not dtype: - tensor = tensor.to(dtype) - if name not in utils.offload_index: - accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) + # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): + # dtype = torch.float16 + # if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): + # dtype = torch.float32 + # if name in model_dict and model_dict[name].dtype is not dtype: + # model_dict[name] = model_dict[name].to(dtype) + # if tensor.dtype is not dtype: + # tensor = tensor.to(dtype) + # if name not in utils.offload_index: + # accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache") utils.bar.close() utils.bar = None @@ -3078,10 +3077,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) # Is CUDA available? 
If so, use GPU, otherwise fall back to CPU if(koboldai_vars.hascuda and koboldai_vars.usegpu): - model = model.half().to(koboldai_vars.gpu_device) + model = model.to(koboldai_vars.gpu_device) generator = model.generate else: - model = model.to('cpu').float() + model = model.to('cpu') generator = model.generate patch_causallm(model) # Use the Generic implementation @@ -3131,7 +3130,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_30B_4BIT'], 4) + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") @@ -3190,7 +3189,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal import shutil tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) if(koboldai_vars.fp32_model and ("breakmodel" not in globals() or not breakmodel.disk_blocks)): # Use save_pretrained to convert fp32 models to fp16, unless we are using disk cache because save_pretrained is not supported in that case - model = model.half() model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB") else: # For fp16 models, we can just copy the model files directly import transformers.configuration_utils @@ -3224,7 +3222,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.hascuda): if(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) - model = model.half().to(koboldai_vars.gpu_device) + model = model.to(koboldai_vars.gpu_device) generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) @@ -3236,7 +3234,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model = model.to('cpu').float() + model = model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): @@ -3244,7 +3242,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model.to('cpu').float() + model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate From 3f132ce45ba61f30015147bb0d9ba26647204332 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Fri, 10 Mar 2023 03:26:09 -0500 Subject: [PATCH 03/47] Notify if LLAMA_4BIT env var not set --- aiserver.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 66aa7362..399ce434 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3130,7 +3130,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: # 
model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + if os.environ.get('LLAMA_4BIT') is not None: + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + else: + raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before procedding.") + exit(1) + except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") From 1808b0d2eca42e30bee6edd6896744cfd6995ffc Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Sat, 11 Mar 2023 12:13:22 -0500 Subject: [PATCH 04/47] Another safety check for if model is not loaded --- aiserver.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 399ce434..3ec8f284 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3133,13 +3133,17 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if os.environ.get('LLAMA_4BIT') is not None: model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) else: - raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before procedding.") + raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") exit(1) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + + if model is None: + raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load. Exiting.") + exit(1) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From bde31217f164a3aadc4282913012378a886d6058 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Sat, 11 Mar 2023 12:15:58 -0500 Subject: [PATCH 05/47] improve model None check --- aiserver.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/aiserver.py b/aiserver.py index 3ec8f284..c14ac730 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3134,16 +3134,14 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) else: raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") - exit(1) + + if model is None: + raise RuntimeError("Model returned 'None'. 
This is not expected to happen, but due to this, the model will not load.") except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - - if model is None: - raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load. Exiting.") - exit(1) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From b3b454bbe4b4a479ec5703b99487bf00906975ac Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed, 15 Mar 2023 00:03:43 -0500 Subject: [PATCH 06/47] Update huggingface.yml --- environments/huggingface.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 222bb6ad..26e7e670 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -29,7 +29,8 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers==4.25.1 + - git+https://github.com/zphang/transformers@llama_push + - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc - huggingface_hub==0.12.1 - safetensors - accelerate From 5d17692c79a3642b7e1ae1c37e262cd47f449356 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 16 Mar 2023 05:19:47 +0000 Subject: [PATCH 07/47] Remove except Exception so that errors actually show up --- aiserver.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/aiserver.py b/aiserver.py index 77e31b63..40d9a4ba 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3128,20 +3128,15 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") # except Exception as e: # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - try: - # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - if os.environ.get('LLAMA_4BIT') is not None: - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) - else: - raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") + # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - if model is None: - raise RuntimeError("Model returned 'None'. 
This is not expected to happen, but due to this, the model will not load.") + if os.environ.get('LLAMA_4BIT'): + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + else: + raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") - except Exception as e: - if("out of memory" in traceback.format_exc().lower()): - raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + if model is None: + raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load.") elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From 60acf593160ce86118286ab0fa5c4ce082ddc52c Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 19 Mar 2023 21:19:02 +0000 Subject: [PATCH 08/47] Improve 4-bit llama support, add 4-bit gptj and gptneox support --- aiserver.py | 86 +++++++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 46 deletions(-) diff --git a/aiserver.py b/aiserver.py index 40d9a4ba..96ea7490 100644 --- a/aiserver.py +++ b/aiserver.py @@ -87,37 +87,14 @@ from io import BytesIO global tpu_mtj_backend -from transformers.models.llama.tokenization_llama import LLaMATokenizer -from repos.gptq.gptq import * -from repos.gptq.modelutils import * -from repos.gptq.quant import * -def load_quant(model, checkpoint, wbits): - from transformers import LLaMAConfig, LLaMAForCausalLM - config = LLaMAConfig.from_pretrained(model) - def noop(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = noop - torch.nn.init.uniform_ = noop - torch.nn.init.normal_ = noop - torch.set_default_dtype(torch.half) - transformers.modeling_utils._init_weights = False - torch.set_default_dtype(torch.half) - model = LLaMAForCausalLM(config) - torch.set_default_dtype(torch.float) - model = model.eval() - layers = find_layers(model) - for name in ['lm_head']: - if name in layers: - del layers[name] - make_quant(model, layers, wbits) - - print('Loading model ...') - model.load_state_dict(torch.load(checkpoint)) - model.seqlen = 2048 - print('Done.') - - return model +# 4-bit dependencies +from pathlib import Path +sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) +from gptj import load_quant as gptj_load_quant +from gptneox import load_quant as gptneox_load_quant +from llama import load_quant as llama_load_quant +vars_4bit = {} if lupa.LUA_VERSION[:2] != (5, 4): @@ -1541,6 +1518,11 @@ def general_startup(override_args=None): parser.add_argument('-v', '--verbosity', action='count', default=0, help="The default logging level is ERROR or higher. This value increases the amount of logging seen in your screen") parser.add_argument('-q', '--quiesce', action='count', default=0, help="The default logging level is ERROR or higher. 
This value decreases the amount of logging seen in your screen") + # 4-bit stuff + parser.add_argument('--gptj4bit', help="Load a GPT-J model 4-bit pt file with this path") + parser.add_argument('--gptneox4bit', help="Load a GPT-NeoX model 4-bit pt file with this path") + parser.add_argument('--llama4bit', help="Load a Llama model 4-bit pt file with this path") + #args: argparse.Namespace = None if "pytest" in sys.modules and override_args is None: args = parser.parse_args([]) @@ -1644,6 +1626,11 @@ def general_startup(override_args=None): koboldai_vars.smanrename = koboldai_vars.host == args.override_rename koboldai_vars.aria2_port = args.aria2_port or 6799 + + global vars_4bit + vars_4bit["gptj4bit"] = args.gptj4bit + vars_4bit["gptneox4bit"] = args.gptneox4bit + vars_4bit["llama4bit"] = args.llama4bit #Now let's look to see if we are going to force a load of a model from a user selected folder if(koboldai_vars.model == "selectfolder"): @@ -2971,7 +2958,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal try: f = z.open(f"archive/data/{storage_key}") except: - f = z.open(f"{zipfolder}/data/{storage_key}") + ziproot = z.namelist()[0].split(os.sep)[0] + f = z.open(f"{ziproot}/data/{storage_key}") current_offset = 0 if current_offset != model_dict[key].seek_offset: f.read(model_dict[key].seek_offset - current_offset) @@ -3117,23 +3105,29 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): - tokenizer = LLaMATokenizer.from_pretrained(koboldai_vars.custmodpth) - # try: - # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) - # except Exception as e: - # try: - # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - # except Exception as e: - # try: - # tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - # except Exception as e: - # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + global vars_4bit - if os.environ.get('LLAMA_4BIT'): - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + if vars_4bit.get("gptj4bit"): + model = gptj_load_quant(koboldai_vars.custmodpth, vars_4bit["gptj4bit"], 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif vars_4bit.get("gptneox4bit"): + model = gptneox_load_quant(koboldai_vars.custmodpth, vars_4bit["gptneox4bit"], 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif vars_4bit.get("llama4bit"): + model = llama_load_quant(koboldai_vars.custmodpth, vars_4bit["llama4bit"], 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: - raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") + try: + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) + except Exception as e: + try: + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, 
revision=koboldai_vars.revision, cache_dir="cache") + except Exception as e: + try: + tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") + except Exception as e: + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) if model is None: raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load.") From 858657f6691933ad3660660001837491b7ba4ae6 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 20 Mar 2023 09:16:30 +0100 Subject: [PATCH 09/47] Fix zipfile folder identification fix for Windows --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 96ea7490..4558ce3d 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2958,7 +2958,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal try: f = z.open(f"archive/data/{storage_key}") except: - ziproot = z.namelist()[0].split(os.sep)[0] + ziproot = z.namelist()[0].split("/")[0] f = z.open(f"{ziproot}/data/{storage_key}") current_offset = 0 if current_offset != model_dict[key].seek_offset: From 4cfc1219d449ebc92205eed15f0ffc1b133db708 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 20 Mar 2023 19:13:46 +0000 Subject: [PATCH 10/47] Add gptq as submodule --- .gitmodules | 4 ++++ repos/gptq | 1 + 2 files changed, 5 insertions(+) create mode 160000 repos/gptq diff --git a/.gitmodules b/.gitmodules index 0107a8c3..c6f4b308 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,7 @@ [submodule "KoboldAI-Horde-Bridge"] path = KoboldAI-Horde-Bridge url = https://github.com/db0/KoboldAI-Horde-Bridge +[submodule "repos/gptq"] + path = repos/gptq + url = https://github.com/0cc4m/GPTQ-for-LLaMa + branch = a8303654c200c25577130466e5f9bc1e70fc8a50 diff --git a/repos/gptq b/repos/gptq new file mode 160000 index 00000000..a8303654 --- /dev/null +++ b/repos/gptq @@ -0,0 +1 @@ +Subproject commit a8303654c200c25577130466e5f9bc1e70fc8a50 From ecd065a881d40996558ff07d0e2bfdbdf255e777 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 21 Mar 2023 21:40:59 +0000 Subject: [PATCH 11/47] Overhaul 4-bit support to load with a toggle --- aiserver.py | 145 +++++++++++++++++++++++++++--------------- koboldai_settings.py | 6 +- static/koboldai.js | 32 +++++++++- templates/popups.html | 6 +- 4 files changed, 130 insertions(+), 59 deletions(-) diff --git a/aiserver.py b/aiserver.py index f58d949a..7497dfb9 100644 --- a/aiserver.py +++ b/aiserver.py @@ -70,7 +70,7 @@ from utils import debounce import utils import koboldai_settings import torch -from transformers import StoppingCriteria, GPT2Tokenizer, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel, modeling_utils, AutoModelForTokenClassification +from transformers import StoppingCriteria, GPT2Tokenizer, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel, modeling_utils, AutoModelForTokenClassification, LlamaTokenizer from transformers import __version__ as transformers_version import transformers try: @@ -1114,14 +1114,20 @@ def device_config(config): koboldai_vars.usegpu = False return -def move_model_to_devices(model): +def move_model_to_devices(model, use_4_bit=False): global generator if(not 
utils.HAS_ACCELERATE and not koboldai_vars.breakmodel): if(koboldai_vars.usegpu): - model = model.to(koboldai_vars.gpu_device) + if not use_4_bit: + model = model.half().to(koboldai_vars.gpu_device) + else: + model = model.to(koboldai_vars.gpu_device) else: - model = model.to('cpu') + if not use_4_bit: + model = model.to('cpu').float() + else: + model = model.to('cpu') generator = model.generate return @@ -1149,6 +1155,8 @@ def move_model_to_devices(model): generator = model.generate return + if not use_4_bit: + model.half() gc.collect() if(hasattr(model, "transformer")): @@ -1518,11 +1526,6 @@ def general_startup(override_args=None): parser.add_argument('-v', '--verbosity', action='count', default=0, help="The default logging level is ERROR or higher. This value increases the amount of logging seen in your screen") parser.add_argument('-q', '--quiesce', action='count', default=0, help="The default logging level is ERROR or higher. This value decreases the amount of logging seen in your screen") - # 4-bit stuff - parser.add_argument('--gptj4bit', help="Load a GPT-J model 4-bit pt file with this path") - parser.add_argument('--gptneox4bit', help="Load a GPT-NeoX model 4-bit pt file with this path") - parser.add_argument('--llama4bit', help="Load a Llama model 4-bit pt file with this path") - #args: argparse.Namespace = None if "pytest" in sys.modules and override_args is None: args = parser.parse_args([]) @@ -1626,11 +1629,6 @@ def general_startup(override_args=None): koboldai_vars.smanrename = koboldai_vars.host == args.override_rename koboldai_vars.aria2_port = args.aria2_port or 6799 - - global vars_4bit - vars_4bit["gptj4bit"] = args.gptj4bit - vars_4bit["gptneox4bit"] = args.gptneox4bit - vars_4bit["llama4bit"] = args.llama4bit #Now let's look to see if we are going to force a load of a model from a user selected folder if(koboldai_vars.model == "selectfolder"): @@ -1777,6 +1775,7 @@ def get_model_info(model, directory=""): 'break_values': break_values, 'gpu_count': gpu_count, 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select, 'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False, + 'bit_4_available': koboldai_vars.bit_4_available if koboldai_vars.experimental_features else False, 'show_custom_model_box': show_custom_model_box}) if send_horde_models: get_cluster_models({'key': key_value, 'url': default_url}) @@ -1918,6 +1917,18 @@ def get_cluster_models(msg): emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2") +@socketio.on("use_4_bit_toggle") +def use_4_bit_toggle(msg): + # Disable lazy_load and breakmodel + if msg["use_4_bit"]: + koboldai_vars.lazy_load = False + koboldai_vars.nobreakmodel = True + else: + koboldai_vars.lazy_load = True + koboldai_vars.nobreakmodel = False + + # TODO: Reload JS values for this stuff + # Function to patch transformers to use our soft prompt def patch_causallm(model): from torch.nn import Embedding @@ -2647,7 +2658,7 @@ def unload_model(): koboldai_vars.badwordsids = koboldai_settings.badwordsids_default -def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False): +def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, 
use_4_bit=False): global model global generator global torch @@ -2684,7 +2695,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal disk_layers = args.breakmodel_disklayers if breakmodel_args_default_to_cpu and disk_layers is None: disk_layers = args.breakmodel_disklayers = 0 - + unload_model() if online_model == "": @@ -2904,10 +2915,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal @functools.lru_cache(maxsize=None) def get_original_key(key): - try: - return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) - except ValueError: - return key + # try: + return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + # except ValueError: + # return key for key, value in model_dict.items(): original_key = get_original_key(key) @@ -2970,10 +2981,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True) model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - # if model_dict[key].dtype is torch.float32: - # koboldai_vars.fp32_model = True - # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - # model_dict[key] = model_dict[key].to(torch.float16) + if not use_4_bit: + if model_dict[key].dtype is torch.float32: + koboldai_vars.fp32_model = True + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: + model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -2997,16 +3009,17 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if utils.offload_index: for name, tensor in utils.named_buffers: dtype = tensor.dtype - # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): - # dtype = torch.float16 - # if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): - # dtype = torch.float32 - # if name in model_dict and model_dict[name].dtype is not dtype: - # model_dict[name] = model_dict[name].to(dtype) - # if tensor.dtype is not dtype: - # tensor = tensor.to(dtype) - # if name not in utils.offload_index: - # accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) + if not use_4_bit: + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): + dtype = torch.float16 + if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): + dtype = torch.float32 + if name in model_dict and model_dict[name].dtype is not dtype: + model_dict[name] = model_dict[name].to(dtype) + if tensor.dtype is not dtype: + tensor = tensor.to(dtype) + if name not 
in utils.offload_index: + accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache") utils.bar.close() utils.bar = None @@ -3065,10 +3078,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) # Is CUDA available? If so, use GPU, otherwise fall back to CPU if(koboldai_vars.hascuda and koboldai_vars.usegpu): - model = model.to(koboldai_vars.gpu_device) + if not use_4_bit: + model = model.half().to(koboldai_vars.gpu_device) + else: + model = model.to(koboldai_vars.gpu_device) generator = model.generate else: - model = model.to('cpu') + if not use_4_bit: + model = model.to('cpu').float() + else: + model = model.to('cpu') generator = model.generate patch_causallm(model) # Use the Generic implementation @@ -3105,17 +3124,26 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): - global vars_4bit - if vars_4bit.get("gptj4bit"): - model = gptj_load_quant(koboldai_vars.custmodpth, vars_4bit["gptj4bit"], 4) - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) - elif vars_4bit.get("gptneox4bit"): - model = gptneox_load_quant(koboldai_vars.custmodpth, vars_4bit["gptneox4bit"], 4) - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) - elif vars_4bit.get("llama4bit"): - model = llama_load_quant(koboldai_vars.custmodpth, vars_4bit["llama4bit"], 4) - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") + + if not os.path.isfile(path_4bit): + print(f"4-bit file {path_4bit} not found, aborting 4-bit load") + use_4_bit = False + + if use_4_bit: + print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") + if koboldai_vars.model_type == "gptj": + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif koboldai_vars.model_type == "gpt_neox": + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif koboldai_vars.model_type == "llama": + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) + else: + raise RuntimeError(f"4-bit load failed. 
Model type {koboldai_vars.model_type} not supported in 4-bit") else: try: tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) @@ -3185,6 +3213,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal import shutil tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) if(koboldai_vars.fp32_model and ("breakmodel" not in globals() or not breakmodel.disk_blocks)): # Use save_pretrained to convert fp32 models to fp16, unless we are using disk cache because save_pretrained is not supported in that case + if not use_4_bit: + model = model.half() model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB") else: # For fp16 models, we can just copy the model files directly import transformers.configuration_utils @@ -3218,27 +3248,36 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.hascuda): if(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) - model = model.to(koboldai_vars.gpu_device) + if not use_4_bit: + model = model.half().to(koboldai_vars.gpu_device) + else: + model = model.to(koboldai_vars.gpu_device) generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) if(not koboldai_vars.lazy_load): device_config(model.config) - move_model_to_devices(model) + move_model_to_devices(model, use_4_bit) elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): - move_model_to_devices(model) + move_model_to_devices(model, use_4_bit) koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model = model.to('cpu') + if not use_4_bit: + model.to('cpu').float() + else: + model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): - move_model_to_devices(model) + move_model_to_devices(model, use_4_bit) koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model.to('cpu') + if not use_4_bit: + model.to('cpu').float() + else: + model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate @@ -8784,7 +8823,7 @@ def UI_2_load_model(data): koboldai_vars.model = data['model'] koboldai_vars.custmodpth = data['path'] print("loading Model") - load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit']) + load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'], use_4_bit=data['use_4_bit']) #==================================================================# # Event triggered when load story is clicked diff --git a/koboldai_settings.py b/koboldai_settings.py index 95caec0c..16cc8128 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1207,7 +1207,7 @@ class system_settings(settings): 'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy', 'serverstarted', 'inference_config', 'image_pipeline', 'summarizer', 'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model', - 
'generating_image', 'bit_8_available', 'host', 'hascuda', 'usegpu', 'rng_states'] + 'generating_image', 'bit_8_available', 'bit_4_available', 'host', 'hascuda', 'usegpu', 'rng_states'] settings_name = "system" def __init__(self, socketio, koboldai_var): self._socketio = socketio @@ -1302,6 +1302,8 @@ class system_settings(settings): elif torch.cuda.get_device_properties(device).major == 7 and torch.cuda.get_device_properties(device).minor >= 2: self.bit_8_available = True break + # Check if repos/gptq exists for 4-bit mode + self.bit_4_available = os.path.isdir("repos/gptq") self.seen_messages = [] @@ -2744,4 +2746,4 @@ default_preset = { ] } badwordsids_default = [[6880], [50256], [42496], [4613], [17414], [22039], [16410], [27], [29], [38430], [37922], [15913], [24618], [28725], [58], [47175], [36937], [26700], [12878], [16471], [37981], [5218], [29795], [13412], [45160], [3693], [49778], [4211], [20598], [36475], [33409], [44167], [32406], [29847], [29342], [42669], [685], [25787], [7359], [3784], [5320], [33994], [33490], [34516], [43734], [17635], [24293], [9959], [23785], [21737], [28401], [18161], [26358], [32509], [1279], [38155], [18189], [26894], [6927], [14610], [23834], [11037], [14631], [26933], [46904], [22330], [25915], [47934], [38214], [1875], [14692], [41832], [13163], [25970], [29565], [44926], [19841], [37250], [49029], [9609], [44438], [16791], [17816], [30109], [41888], [47527], [42924], [23984], [49074], [33717], [31161], [49082], [30138], [31175], [12240], [14804], [7131], [26076], [33250], [3556], [38381], [36338], [32756], [46581], [17912], [49146]] # Tokenized array of badwords used to prevent AI artifacting -badwordsids_neox = [[0], [1], [44162], [9502], [12520], [31841], [36320], [49824], [34417], [6038], [34494], [24815], [26635], [24345], [3455], [28905], [44270], [17278], [32666], [46880], [7086], [43189], [37322], [17778], [20879], [49821], [3138], [14490], [4681], [21391], [26786], [43134], [9336], [683], [48074], [41256], [19181], [29650], [28532], [36487], [45114], [46275], [16445], [15104], [11337], [1168], [5647], [29], [27482], [44965], [43782], [31011], [42944], [47389], [6334], [17548], [38329], [32044], [35487], [2239], [34761], [7444], [1084], [12399], [18990], [17636], [39083], [1184], [35830], [28365], [16731], [43467], [47744], [1138], [16079], [40116], [45564], [18297], [42368], [5456], [18022], [42696], [34476], [23505], [23741], [39334], [37944], [45382], [38709], [33440], [26077], [43600], [34418], [36033], [6660], [48167], [48471], [15775], [19884], [41533], [1008], [31053], [36692], [46576], [20095], [20629], [31759], [46410], [41000], [13488], [30952], [39258], [16160], [27655], [22367], [42767], [43736], [49694], [13811], [12004], [46768], [6257], [37471], [5264], [44153], [33805], [20977], [21083], [25416], [14277], [31096], [42041], [18331], [33376], [22372], [46294], [28379], [38475], [1656], [5204], [27075], [50001], [16616], [11396], [7748], [48744], [35402], [28120], [41512], [4207], [43144], [14767], [15640], [16595], [41305], [44479], [38958], [18474], [22734], [30522], [46267], [60], [13976], [31830], [48701], [39822], [9014], [21966], [31422], [28052], [34607], [2479], [3851], [32214], [44082], [45507], [3001], [34368], [34758], [13380], [38363], [4299], [46802], [30996], [12630], [49236], [7082], [8795], [5218], [44740], [9686], [9983], [45301], [27114], [40125], [1570], [26997], [544], [5290], [49193], [23781], [14193], [40000], [2947], [43781], [9102], [48064], [42274], [18772], [49384], [9884], [45635], [43521], [31258], 
[32056], [47686], [21760], [13143], [10148], [26119], [44308], [31379], [36399], [23983], [46694], [36134], [8562], [12977], [35117], [28591], [49021], [47093], [28653], [29013], [46468], [8605], [7254], [25896], [5032], [8168], [36893], [38270], [20499], [27501], [34419], [29547], [28571], [36586], [20871], [30537], [26842], [21375], [31148], [27618], [33094], [3291], [31789], [28391], [870], [9793], [41361], [47916], [27468], [43856], [8850], [35237], [15707], [47552], [2730], [41449], [45488], [3073], [49806], [21938], [24430], [22747], [20924], [46145], [20481], [20197], [8239], [28231], [17987], [42804], [47269], [29972], [49884], [21382], [46295], [36676], [34616], [3921], [26991], [27720], [46265], [654], [9855], [40354], [5291], [34904], [44342], [2470], [14598], [880], [19282], [2498], [24237], [21431], [16369], [8994], [44524], [45662], [13663], [37077], [1447], [37786], [30863], [42854], [1019], [20322], [4398], [12159], [44072], [48664], [31547], [18736], [9259], [31], [16354], [21810], [4357], [37982], [5064], [2033], [32871], [47446], [62], [22158], [37387], [8743], [47007], [17981], [11049], [4622], [37916], [36786], [35138], [29925], [14157], [18095], [27829], [1181], [22226], [5709], [4725], [30189], [37014], [1254], [11380], [42989], [696], [24576], [39487], [30119], [1092], [8088], [2194], [9899], [14412], [21828], [3725], [13544], [5180], [44679], [34398], [3891], [28739], [14219], [37594], [49550], [11326], [6904], [17266], [5749], [10174], [23405], [9955], [38271], [41018], [13011], [48392], [36784], [24254], [21687], [23734], [5413], [41447], [45472], [10122], [17555], [15830], [47384], [12084], [31350], [47940], [11661], [27988], [45443], [905], [49651], [16614], [34993], [6781], [30803], [35869], [8001], [41604], [28118], [46462], [46762], [16262], [17281], [5774], [10943], [5013], [18257], [6750], [4713], [3951], [11899], [38791], [16943], [37596], [9318], [18413], [40473], [13208], [16375]] \ No newline at end of file +badwordsids_neox = [[0], [1], [44162], [9502], [12520], [31841], [36320], [49824], [34417], [6038], [34494], [24815], [26635], [24345], [3455], [28905], [44270], [17278], [32666], [46880], [7086], [43189], [37322], [17778], [20879], [49821], [3138], [14490], [4681], [21391], [26786], [43134], [9336], [683], [48074], [41256], [19181], [29650], [28532], [36487], [45114], [46275], [16445], [15104], [11337], [1168], [5647], [29], [27482], [44965], [43782], [31011], [42944], [47389], [6334], [17548], [38329], [32044], [35487], [2239], [34761], [7444], [1084], [12399], [18990], [17636], [39083], [1184], [35830], [28365], [16731], [43467], [47744], [1138], [16079], [40116], [45564], [18297], [42368], [5456], [18022], [42696], [34476], [23505], [23741], [39334], [37944], [45382], [38709], [33440], [26077], [43600], [34418], [36033], [6660], [48167], [48471], [15775], [19884], [41533], [1008], [31053], [36692], [46576], [20095], [20629], [31759], [46410], [41000], [13488], [30952], [39258], [16160], [27655], [22367], [42767], [43736], [49694], [13811], [12004], [46768], [6257], [37471], [5264], [44153], [33805], [20977], [21083], [25416], [14277], [31096], [42041], [18331], [33376], [22372], [46294], [28379], [38475], [1656], [5204], [27075], [50001], [16616], [11396], [7748], [48744], [35402], [28120], [41512], [4207], [43144], [14767], [15640], [16595], [41305], [44479], [38958], [18474], [22734], [30522], [46267], [60], [13976], [31830], [48701], [39822], [9014], [21966], [31422], [28052], [34607], [2479], [3851], [32214], [44082], [45507], [3001], 
[34368], [34758], [13380], [38363], [4299], [46802], [30996], [12630], [49236], [7082], [8795], [5218], [44740], [9686], [9983], [45301], [27114], [40125], [1570], [26997], [544], [5290], [49193], [23781], [14193], [40000], [2947], [43781], [9102], [48064], [42274], [18772], [49384], [9884], [45635], [43521], [31258], [32056], [47686], [21760], [13143], [10148], [26119], [44308], [31379], [36399], [23983], [46694], [36134], [8562], [12977], [35117], [28591], [49021], [47093], [28653], [29013], [46468], [8605], [7254], [25896], [5032], [8168], [36893], [38270], [20499], [27501], [34419], [29547], [28571], [36586], [20871], [30537], [26842], [21375], [31148], [27618], [33094], [3291], [31789], [28391], [870], [9793], [41361], [47916], [27468], [43856], [8850], [35237], [15707], [47552], [2730], [41449], [45488], [3073], [49806], [21938], [24430], [22747], [20924], [46145], [20481], [20197], [8239], [28231], [17987], [42804], [47269], [29972], [49884], [21382], [46295], [36676], [34616], [3921], [26991], [27720], [46265], [654], [9855], [40354], [5291], [34904], [44342], [2470], [14598], [880], [19282], [2498], [24237], [21431], [16369], [8994], [44524], [45662], [13663], [37077], [1447], [37786], [30863], [42854], [1019], [20322], [4398], [12159], [44072], [48664], [31547], [18736], [9259], [31], [16354], [21810], [4357], [37982], [5064], [2033], [32871], [47446], [62], [22158], [37387], [8743], [47007], [17981], [11049], [4622], [37916], [36786], [35138], [29925], [14157], [18095], [27829], [1181], [22226], [5709], [4725], [30189], [37014], [1254], [11380], [42989], [696], [24576], [39487], [30119], [1092], [8088], [2194], [9899], [14412], [21828], [3725], [13544], [5180], [44679], [34398], [3891], [28739], [14219], [37594], [49550], [11326], [6904], [17266], [5749], [10174], [23405], [9955], [38271], [41018], [13011], [48392], [36784], [24254], [21687], [23734], [5413], [41447], [45472], [10122], [17555], [15830], [47384], [12084], [31350], [47940], [11661], [27988], [45443], [905], [49651], [16614], [34993], [6781], [30803], [35869], [8001], [41604], [28118], [46462], [46762], [16262], [17281], [5774], [10943], [5013], [18257], [6750], [4713], [3951], [11899], [38791], [16943], [37596], [9318], [18413], [40473], [13208], [16375]] diff --git a/static/koboldai.js b/static/koboldai.js index cce66f80..05dcc47e 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -1472,6 +1472,7 @@ function show_model_menu(data) { document.getElementById("modelurl").classList.add("hidden"); document.getElementById("use_gpu_div").classList.add("hidden"); document.getElementById("use_8_bit_div").classList.add("hidden"); + document.getElementById("use_4_bit_div").classList.add("hidden"); document.getElementById("modellayers").classList.add("hidden"); document.getElementById("oaimodel").classList.add("hidden"); var model_layer_bars = document.getElementById('model_layer_bars'); @@ -1646,6 +1647,14 @@ function selected_model_info(data) { document.getElementById("use_8_bit").checked = false; } + //hide or unhide 4 bit mode + if (data.bit_4_available) { + document.getElementById("use_4_bit_div").classList.remove("hidden"); + } else { + document.getElementById("use_4_bit_div").classList.add("hidden"); + document.getElementById("use_4_bit").checked = false; + } + //default URL loading if (data.default_url != null) { document.getElementById("modelurl").value = data.default_url; @@ -1815,7 +1824,7 @@ function selected_model_info(data) { } accept.disabled = false; - + set_4_bit_mode(invert=false); } function 
update_gpu_layers() { @@ -1876,7 +1885,8 @@ function load_model() { 'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(), 'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value, 'online_model': selected_models, - 'use_8_bit': document.getElementById('use_8_bit').checked}; + 'use_8_bit': document.getElementById('use_8_bit').checked, + 'use_4_bit': document.getElementById('use_4_bit').checked}; socket.emit("load_model", message); closePopups(); } @@ -3160,6 +3170,22 @@ function save_preset() { closePopups(); } +function set_4_bit_mode(invert=true) { + bit_4_status = document.getElementById("use_4_bit").checked; + if (invert) { + bit_4_status = !bit_4_status; + } + if (bit_4_status) { + document.getElementById("modellayers").classList.add("hidden"); + socket.emit("use_4_bit_toggle", {"use_4_bit": false}); + } else { + document.getElementById("modellayers").classList.remove("hidden"); + socket.emit("use_4_bit_toggle", {"use_4_bit": true}); + } +} + + + //--------------------------------------------General UI Functions------------------------------------ function set_ui_level(level) { for (classname of ['setting_container', 'setting_container_single', 'setting_container_single_wide', 'biasing', 'palette_area', 'advanced_theme']) { @@ -7301,4 +7327,4 @@ $el("#gamescreen").addEventListener("paste", function(event) { false, event.clipboardData.getData("text/plain") ); -}); \ No newline at end of file +}); diff --git a/templates/popups.html b/templates/popups.html index 44cf7cb6..804b1b9f 100644 --- a/templates/popups.html +++ b/templates/popups.html @@ -75,6 +75,10 @@
Use 8 bit mode
+ @@ -402,4 +406,4 @@ -
\ No newline at end of file +
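
Taken together, patches 08 through 11 converge on a single 4-bit code path: the GPTQ loaders are imported from the repos/gptq submodule, a 4bit.pt checkpoint is expected next to the model files, and the detected model_type selects which loader to call. The sketch below summarizes that dispatch as of this patch; it is a simplified illustration, not code lifted verbatim from aiserver.py. The load_quant imports, the 4bit.pt name, the model-type strings, and the error messages come from the diffs above, while the load_4bit_model wrapper and its arguments are invented here for clarity, and later patches in this series extend the load_quant calls with a group-size argument.

# Simplified sketch of the 4-bit loading dispatch as of PATCH 11 (illustrative only).
import os
import sys
from pathlib import Path

# The GPTQ loaders live in the repos/gptq submodule added in PATCH 10.
sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
from gptj import load_quant as gptj_load_quant
from gptneox import load_quant as gptneox_load_quant
from llama import load_quant as llama_load_quant

from transformers import AutoTokenizer, LlamaTokenizer


def load_4bit_model(model_path, model_type, gpu_device=0):
    # The quantized weights are expected alongside the model config as 4bit.pt.
    checkpoint = os.path.join(model_path, "4bit.pt")
    if not os.path.isfile(checkpoint):
        raise RuntimeError(f"4-bit load failed. PT-File not found at {checkpoint}")

    # Pick the GPTQ loader that matches the detected model type.
    if model_type == "gptj":
        model = gptj_load_quant(model_path, checkpoint, 4)
        tokenizer = AutoTokenizer.from_pretrained(model_path)
    elif model_type == "gpt_neox":
        model = gptneox_load_quant(model_path, checkpoint, 4)
        tokenizer = AutoTokenizer.from_pretrained(model_path)
    elif model_type == "llama":
        model = llama_load_quant(model_path, checkpoint, 4)
        tokenizer = LlamaTokenizer.from_pretrained(model_path)
    else:
        raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")

    # Breakmodel and lazy loading are disabled for 4-bit, so the whole model is
    # moved to a single GPU without the usual .half() conversion.
    return model.to(gpu_device), tokenizer
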
From c7edc764b95d44603e4d450d4326ce3628188ef3 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 21 Mar 2023 21:58:31 +0000 Subject: [PATCH 12/47] Fix llama loading --- aiserver.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/aiserver.py b/aiserver.py index 7497dfb9..967af85f 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2915,10 +2915,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal @functools.lru_cache(maxsize=None) def get_original_key(key): - # try: - return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) - # except ValueError: - # return key + try: + return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + except ValueError: + return key for key, value in model_dict.items(): original_key = get_original_key(key) @@ -3128,8 +3128,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, aborting 4-bit load") - use_4_bit = False + print(f"4-bit file {path_4bit} not found, loading failed") + raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") if use_4_bit: print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") @@ -3155,7 +3155,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) if model is None: raise RuntimeError("Model returned 'None'. 
This is not expected to happen, but due to this, the model will not load.") From 8941428c66c377baa10aa95afd3186733dd92b89 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 22 Mar 2023 06:22:34 +0000 Subject: [PATCH 13/47] Fix Kobold loading to CPU in 4-bit, causing CUDA ASSERT error --- aiserver.py | 6 ++++-- repos/gptq | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 967af85f..2c50cfcc 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3102,7 +3102,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors - if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel: + if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel and not use_4_bit: device_config(model_config) # Download model from Huggingface if it does not exist, otherwise load locally @@ -3133,6 +3133,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if use_4_bit: print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") + koboldai_vars.breakmodel = False + koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) @@ -3255,7 +3257,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) - if(not koboldai_vars.lazy_load): + if(not koboldai_vars.lazy_load and not use_4_bit): device_config(model.config) move_model_to_devices(model, use_4_bit) elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): diff --git a/repos/gptq b/repos/gptq index a8303654..791cfe37 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit a8303654c200c25577130466e5f9bc1e70fc8a50 +Subproject commit 791cfe376af33aa01032dd52147050083a6345cf From 026eb3205e0f48dac5a4aa965d3e48d79ec5e1ab Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 22 Mar 2023 22:12:06 +0000 Subject: [PATCH 14/47] Fix 4-bit loading error when not loading in 4-bit --- aiserver.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2c50cfcc..745a7cb8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3125,13 +3125,13 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): - path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") - - if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, loading failed") - raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") - if use_4_bit: + path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") + + if not os.path.isfile(path_4bit): + print(f"4-bit file {path_4bit} not found, loading failed") + raise RuntimeError(f"4-bit load failed. 
PT-File not found at {path_4bit}") + print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") koboldai_vars.breakmodel = False koboldai_vars.usegpu = True From 9dcba3897810499786d1fb4b4bd8d41ef595a130 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 24 Mar 2023 19:07:28 +0000 Subject: [PATCH 15/47] Pin transformers to a working Llama-compatible version --- environments/huggingface.yml | 2 +- environments/rocm.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 42dda9c3..6807627e 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -30,7 +30,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - git+https://github.com/zphang/transformers@llama_push + - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc - huggingface_hub==0.12.1 - safetensors diff --git a/environments/rocm.yml b/environments/rocm.yml index 43fd331f..a1d3d8b0 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -29,7 +29,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers==4.25.1 + - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 - huggingface_hub==0.12.1 - safetensors - accelerate From 2e7a8a1a66a3813ff2f68b5e37f659479f44afc2 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 23 Mar 2023 05:53:30 +0000 Subject: [PATCH 16/47] Adapt KoboldAI to latest gptq changes --- aiserver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 745a7cb8..faee85c0 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3136,13 +3136,13 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.breakmodel = False koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") From d1a2005a2710e0720fe2a863ebe4f5d1f9b2ad18 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 27 Mar 2023 20:45:21 +0000 Subject: [PATCH 17/47] Add support for old and new 4-bit format. 
Old one needs 4bit-old.pt file to launch --- aiserver.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index faee85c0..fa2af0f3 100644 --- a/aiserver.py +++ b/aiserver.py @@ -94,7 +94,6 @@ sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant -vars_4bit = {} if lupa.LUA_VERSION[:2] != (5, 4): @@ -3127,9 +3126,29 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if use_4_bit: path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") + path_4bit_old = os.path.join(koboldai_vars.custmodpth, "4bit-old.pt") + + # Monkey-patch in old-format pt-file support + if not os.path.isfile(path_4bit): + print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") + path_4bit = path_4bit_old + + import llama, opt, gptneox, gptj, old_quant, quant_cuda_old + llama.make_quant = old_quant.old_make_quant + opt.make_quant = old_quant.old_make_quant + gptneox.make_quant = old_quant.old_make_quant + gptj.make_quant = old_quant.old_make_quant + elif llama.make_quant == old_quant.old_make_quant: + # Undo monkey patch + import quant + llama.make_quant = quant.make_quant + opt.make_quant = quant.make_quant + gptneox.make_quant = quant.make_quant + gptj.make_quant = quant.make_quant + if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, loading failed") + print(f"4-bit old-format file {path_4bit} not found, loading failed") raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") From 0f1fc46078f9a751e35c0c5e7e35d091a10f3f9b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 27 Mar 2023 21:30:43 +0000 Subject: [PATCH 18/47] Fix errors during inference --- aiserver.py | 14 +++++++++++--- repos/gptq | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/aiserver.py b/aiserver.py index fa2af0f3..2c2eff1b 100644 --- a/aiserver.py +++ b/aiserver.py @@ -94,6 +94,7 @@ sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant +monkey_patched_4bit = False if lupa.LUA_VERSION[:2] != (5, 4): @@ -3128,23 +3129,28 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") path_4bit_old = os.path.join(koboldai_vars.custmodpth, "4bit-old.pt") + global monkey_patched_4bit + # Monkey-patch in old-format pt-file support if not os.path.isfile(path_4bit): print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") path_4bit = path_4bit_old - import llama, opt, gptneox, gptj, old_quant, quant_cuda_old + import llama, opt, gptneox, gptj, old_quant llama.make_quant = old_quant.old_make_quant opt.make_quant = old_quant.old_make_quant gptneox.make_quant = old_quant.old_make_quant gptj.make_quant = old_quant.old_make_quant - elif llama.make_quant == old_quant.old_make_quant: + monkey_patched_4bit = True + elif monkey_patched_4bit: # Undo monkey patch - import quant + print("Undoing 4-bit old format monkey patch") + import llama, opt, gptneox, gptj, quant llama.make_quant = quant.make_quant opt.make_quant = quant.make_quant gptneox.make_quant = quant.make_quant gptj.make_quant = quant.make_quant + 
monkey_patched_4bit = False if not os.path.isfile(path_4bit): @@ -3165,6 +3171,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") + + model = model.float() else: try: tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) diff --git a/repos/gptq b/repos/gptq index 791cfe37..0748a680 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 791cfe376af33aa01032dd52147050083a6345cf +Subproject commit 0748a680e95ab0a9f8860953a5d705a01070d1cc From ef6fe680a97efb740db946c0e4fbf5d2dd54889b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 28 Mar 2023 06:30:02 +0000 Subject: [PATCH 19/47] Fix high VRAM usage caused by workaround for scalar type error --- aiserver.py | 2 +- repos/gptq | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2c2eff1b..27cafd59 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3172,7 +3172,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") - model = model.float() + model = model.half() else: try: tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) diff --git a/repos/gptq b/repos/gptq index 0748a680..5d07f25a 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 0748a680e95ab0a9f8860953a5d705a01070d1cc +Subproject commit 5d07f25a30f8602aedb3e69f11de07624e486ce9 From e698f22706c806e05fdd8c58f91f3d560bcba0d6 Mon Sep 17 00:00:00 2001 From: Digitous <107712289+Digitous@users.noreply.github.com> Date: Tue, 28 Mar 2023 19:14:46 -0400 Subject: [PATCH 20/47] Update README.md --- README.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/README.md b/README.md index 20a1957a..c6e922aa 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,57 @@ +## This is a fork of KoboldAI that implements 4bit GPTQ quantized support to include Llama. + +### Install/Use Guide +(This guide is for both Linux and Windows and assumes user has git installed and a basic grasp of command line use) + +In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. + +git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules + +cd KoboldAI + +Next step, subfolder mode or B: option doesn't matter choose either + +[if on Windows] install_requirements.bat if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. + +[if on Linux] install_requirements.sh + + +[if on Windows] run commandline.bat + +[if on Linux] run commandline.sh + +commandline.bat/commandline.sh will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). 
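
(Editorial aside: the old-format fallback introduced in the patches above reduces to a module-level flag plus a swap of `make_quant`. The sketch below is a condensed illustration of that pattern, not the literal aiserver.py code; it assumes the `repos/gptq` modules `llama`, `opt`, `gptneox`, `gptj`, `old_quant` and `quant` are importable, as they are after the `sys.path.insert` done at startup.)

```python
# Module-level flag, mirroring aiserver.py's monkey_patched_4bit
monkey_patched_4bit = False

def patch_old_4bit_format():
    """Route make_quant through the old-format implementation (4bit-old.pt checkpoints)."""
    global monkey_patched_4bit
    import llama, opt, gptneox, gptj, old_quant
    for mod in (llama, opt, gptneox, gptj):
        mod.make_quant = old_quant.old_make_quant
    monkey_patched_4bit = True

def unpatch_old_4bit_format():
    """Restore the current-format make_quant before loading a new-format checkpoint."""
    global monkey_patched_4bit
    import llama, opt, gptneox, gptj, quant
    for mod in (llama, opt, gptneox, gptj):
        mod.make_quant = quant.make_quant
    monkey_patched_4bit = False
```
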
+ + +cd repos + +cd gptq + + +[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] python setup_cuda.py install + +[if on Linux] python setup_cuda.py install + +After the Cuda kernel is compiled, return to KoboldAI base directory + +[if on Windows (only applies to windows users)] pip install flask_cors + +If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) + +Put your 4bit quantized .pt in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). + +Then move your model folder to KoboldAI/models, and rename the .pt in your model folder to 4bit.pt + +So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). + +If you haven't done so already, exit the command prompt/leave KAI's (base) venv + +Run play.bat [windows] or play.sh [linux] + +Switch to UI2, enable Experimental UI under the Interface tab, then load your model and be sure 4-bit toggle is on. + +The 4bit toggle shows when a model to load is selected. + ## KoboldAI - Your gateway to GPT writing This is a browser-based front-end for AI-assisted writing with multiple local & remote AI models. It offers the standard array of tools, including Memory, Author's Note, World Info, Save & Load, adjustable AI settings, formatting options, and the ability to import existing AI Dungeon adventures. You can also turn on Adventure mode and play the game like AI Dungeon Unleashed. From 8d008b87a608beb47e5f41473a40b437aa33d4b4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 13:25:06 +0000 Subject: [PATCH 21/47] Add OPT support --- aiserver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/aiserver.py b/aiserver.py index 27cafd59..edce6bf1 100644 --- a/aiserver.py +++ b/aiserver.py @@ -94,6 +94,7 @@ sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant +from opt import load_quant as opt_load_quant monkey_patched_4bit = False @@ -3169,6 +3170,9 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal elif koboldai_vars.model_type == "llama": model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif koboldai_vars.model_type == "opt": + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. 
Model type {koboldai_vars.model_type} not supported in 4-bit") From f6f7687cc015821c4d4b4cff7dbfea1052514efb Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 14:47:59 +0000 Subject: [PATCH 22/47] Add 4bit safetensor support, improve loading code --- aiserver.py | 78 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/aiserver.py b/aiserver.py index edce6bf1..2679ddc8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -90,6 +90,7 @@ global tpu_mtj_backend # 4-bit dependencies from pathlib import Path +import glob sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant @@ -2657,6 +2658,50 @@ def unload_model(): #Reload our badwords koboldai_vars.badwordsids = koboldai_settings.badwordsids_default + + +def prepare_4bit_load(modelpath): + paths_4bit = ["4bit.pt", "4bit.safetensors"] + paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] + result = False + for p in paths_4bit: + p = os.path.join(modelpath, p) + if os.path.isfile(p): + result = p + break + + global monkey_patched_4bit + + # Monkey-patch in old-format pt-file support + if not result: + print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") + for p in paths_4bit_old: + p = os.path.join(modelpath, p) + if os.path.isfile(p): + result = p + break + + if not result: + print(f"4-bit old-format file {path_4bit} not found, loading failed") + raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") + + import llama, opt, gptneox, gptj, old_quant + llama.make_quant = old_quant.old_make_quant + opt.make_quant = old_quant.old_make_quant + gptneox.make_quant = old_quant.old_make_quant + gptj.make_quant = old_quant.old_make_quant + monkey_patched_4bit = True + elif monkey_patched_4bit: + # Undo monkey patch + print("Undoing 4-bit old format monkey patch") + import llama, opt, gptneox, gptj, quant + llama.make_quant = quant.make_quant + opt.make_quant = quant.make_quant + gptneox.make_quant = quant.make_quant + gptj.make_quant = quant.make_quant + monkey_patched_4bit = False + + return result def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False): @@ -3127,36 +3172,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(os.path.isdir(koboldai_vars.custmodpth)): if use_4_bit: - path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") - path_4bit_old = os.path.join(koboldai_vars.custmodpth, "4bit-old.pt") - - global monkey_patched_4bit - - # Monkey-patch in old-format pt-file support - if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") - path_4bit = path_4bit_old - - import llama, opt, gptneox, gptj, old_quant - llama.make_quant = old_quant.old_make_quant - opt.make_quant = old_quant.old_make_quant - gptneox.make_quant = old_quant.old_make_quant - gptj.make_quant = old_quant.old_make_quant - monkey_patched_4bit = True - elif monkey_patched_4bit: - # Undo monkey patch - print("Undoing 4-bit old format monkey patch") - import llama, opt, gptneox, gptj, quant - llama.make_quant = quant.make_quant - opt.make_quant = quant.make_quant - gptneox.make_quant = quant.make_quant - gptj.make_quant = quant.make_quant - monkey_patched_4bit = False - - - if not os.path.isfile(path_4bit): - 
print(f"4-bit old-format file {path_4bit} not found, loading failed") - raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") + path_4bit = prepare_4bit_load(koboldai_vars.custmodpth) print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") koboldai_vars.breakmodel = False @@ -3171,7 +3187,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") From a0bc77042624571b878d734ebc41331f6f4d9342 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 19:49:05 +0000 Subject: [PATCH 23/47] Add basic groupsize support Write groupsize into filename, for example 4bit-128b.safetensors for groupsize 128 --- aiserver.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2679ddc8..38805287 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2661,13 +2661,19 @@ def unload_model(): def prepare_4bit_load(modelpath): - paths_4bit = ["4bit.pt", "4bit.safetensors"] + paths_4bit = ["4bit*.safetensors", "4bit*.pt"] paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] result = False + groupsize = -1 for p in paths_4bit: p = os.path.join(modelpath, p) - if os.path.isfile(p): - result = p + val = glob.glob(p) + if val: + result = val[0] + fname = Path(result).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\d+)(?:b-?)", fname) + if g: + groupsize = int(g[0]) break global monkey_patched_4bit @@ -2701,7 +2707,7 @@ def prepare_4bit_load(modelpath): gptj.make_quant = quant.make_quant monkey_patched_4bit = False - return result + return result, groupsize def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False): @@ -3172,22 +3178,23 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(os.path.isdir(koboldai_vars.custmodpth)): if use_4_bit: - path_4bit = prepare_4bit_load(koboldai_vars.custmodpth) + path_4bit, groupsize = prepare_4bit_load(koboldai_vars.custmodpth) + print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") koboldai_vars.breakmodel = False koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif 
koboldai_vars.model_type == "opt": - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") From 73d5ec0e5dd234852a66331b681734e8beb13781 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 20:07:26 +0000 Subject: [PATCH 24/47] Pull latest gptq-changes --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 5d07f25a..6f80e1fd 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 5d07f25a30f8602aedb3e69f11de07624e486ce9 +Subproject commit 6f80e1fdd7232d66e45e02ebf00be4c5d5f933bc From 9d0477f5f73471995fa3e23789a0ac4aa9108b33 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 22:05:44 +0000 Subject: [PATCH 25/47] Fix bug where it picks old model despite new one available --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 38805287..812bc4a8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2667,7 +2667,7 @@ def prepare_4bit_load(modelpath): groupsize = -1 for p in paths_4bit: p = os.path.join(modelpath, p) - val = glob.glob(p) + val = [v for v in glob.glob(p) if "4bit-old" not in v] if val: result = val[0] fname = Path(result).parts[-1] From 61b13604b6ad116561488ab146c3959f40d98099 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 30 Mar 2023 10:57:04 +0200 Subject: [PATCH 26/47] Fix bug in 4-bit load fallback --- aiserver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 812bc4a8..fe0f9a8c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2680,7 +2680,7 @@ def prepare_4bit_load(modelpath): # Monkey-patch in old-format pt-file support if not result: - print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") + print("4-bit file not found, falling back to old format.") for p in paths_4bit_old: p = os.path.join(modelpath, p) if os.path.isfile(p): @@ -2688,8 +2688,8 @@ def prepare_4bit_load(modelpath): break if not result: - print(f"4-bit old-format file {path_4bit} not found, loading failed") - raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") + print("4-bit old-format file not found, loading failed.") + raise RuntimeError(f"4-bit load failed. 
PT-File not found.") import llama, opt, gptneox, gptj, old_quant llama.make_quant = old_quant.old_make_quant From aa2292b3a4dff467e9afaa3270d80fcda4c7994f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 21:43:49 +0000 Subject: [PATCH 27/47] Enable multi-gpu support --- aiserver.py | 50 ++++++++++++++++------------------------------ static/koboldai.js | 9 +-------- 2 files changed, 18 insertions(+), 41 deletions(-) diff --git a/aiserver.py b/aiserver.py index fe0f9a8c..7a4370c0 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1139,7 +1139,7 @@ def move_model_to_devices(model, use_4_bit=False): import accelerate.utils for key, value in model.state_dict().items(): target_dtype = torch.float32 if breakmodel.primary_device == "cpu" else torch.float16 - if(value.dtype is not target_dtype): + if(value.dtype not in (torch.bool, torch.int) and value.dtype is not target_dtype): accelerate.utils.set_module_tensor_to_device(model, key, target_dtype) disk_blocks = breakmodel.disk_blocks gpu_blocks = breakmodel.gpu_blocks @@ -1919,18 +1919,6 @@ def get_cluster_models(msg): emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2") -@socketio.on("use_4_bit_toggle") -def use_4_bit_toggle(msg): - # Disable lazy_load and breakmodel - if msg["use_4_bit"]: - koboldai_vars.lazy_load = False - koboldai_vars.nobreakmodel = True - else: - koboldai_vars.lazy_load = True - koboldai_vars.nobreakmodel = False - - # TODO: Reload JS values for this stuff - # Function to patch transformers to use our soft prompt def patch_causallm(model): from torch.nn import Embedding @@ -3033,11 +3021,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
", end="", flush=True) model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - if not use_4_bit: - if model_dict[key].dtype is torch.float32: - koboldai_vars.fp32_model = True - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - model_dict[key] = model_dict[key].to(torch.float16) + if model_dict[key].dtype is torch.float32: + koboldai_vars.fp32_model = True + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: + model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -3061,17 +3048,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if utils.offload_index: for name, tensor in utils.named_buffers: dtype = tensor.dtype - if not use_4_bit: - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): - dtype = torch.float32 - if name in model_dict and model_dict[name].dtype is not dtype: - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is not dtype: - tensor = tensor.to(dtype) - if name not in utils.offload_index: - accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): + dtype = torch.float16 + if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): + dtype = torch.float32 + if name in model_dict and model_dict[name].dtype is not dtype: + model_dict[name] = model_dict[name].to(dtype) + if tensor.dtype is not dtype: + tensor = tensor.to(dtype) + if name not in utils.offload_index: + accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache") utils.bar.close() utils.bar = None @@ -3154,7 +3140,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors - if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel and not use_4_bit: + if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel: device_config(model_config) # Download model from Huggingface if it does not exist, otherwise load locally @@ -3182,8 +3168,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") - koboldai_vars.breakmodel = False - koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": model = 
gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) @@ -3311,7 +3295,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) - if(not koboldai_vars.lazy_load and not use_4_bit): + if(not koboldai_vars.lazy_load): device_config(model.config) move_model_to_devices(model, use_4_bit) elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): diff --git a/static/koboldai.js b/static/koboldai.js index 05dcc47e..89ee2ea1 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -3173,14 +3173,7 @@ function save_preset() { function set_4_bit_mode(invert=true) { bit_4_status = document.getElementById("use_4_bit").checked; if (invert) { - bit_4_status = !bit_4_status; - } - if (bit_4_status) { - document.getElementById("modellayers").classList.add("hidden"); - socket.emit("use_4_bit_toggle", {"use_4_bit": false}); - } else { - document.getElementById("modellayers").classList.remove("hidden"); - socket.emit("use_4_bit_toggle", {"use_4_bit": true}); + bit_4_status = !bit_4_status; } } From 6eae4574793687b517c45e85e5fc178015c8d088 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 31 Mar 2023 15:36:03 +0200 Subject: [PATCH 28/47] Fix 4bit groupsize param letter Use g instead of b for groupsize name, for example 4bit-128g.safetensors --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 7a4370c0..e7c789ac 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2659,7 +2659,7 @@ def prepare_4bit_load(modelpath): if val: result = val[0] fname = Path(result).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\d+)(?:b-?)", fname) + g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname) if g: groupsize = int(g[0]) break From d3a5ca65057f4f7cf9a2998cd13e5e04de829df1 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 1 Apr 2023 08:52:08 +0000 Subject: [PATCH 29/47] Update gptq submodule to latest --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 6f80e1fd..f4de1019 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 6f80e1fdd7232d66e45e02ebf00be4c5d5f933bc +Subproject commit f4de1019fedce779915e050a517f4cc8dee432dc From bf0c999412b48a6de6a174a33bce3f8b92df1e16 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 1 Apr 2023 14:19:51 +0200 Subject: [PATCH 30/47] Update GPTQ to support AMD --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index f4de1019..954b3218 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit f4de1019fedce779915e050a517f4cc8dee432dc +Subproject commit 954b32183adda2acd437a3ab0683a28ca3c7e4c9 From 110f8229c565a1ac64060e4e1785d4563920d4f4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 1 Apr 2023 21:33:05 +0200 Subject: [PATCH 31/47] Add cudatoolkit-dev for compilation, compatible gcc 9 and update transformers to fix error in gptq --- environments/huggingface.yml | 5 ++++- environments/rocm.yml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 6807627e..71d26e9c 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -11,6 +11,9 @@ dependencies: - pytorch=1.11.* - python=3.8.* - cudatoolkit=11.1 + - cudatoolkit-dev=11.1 + - 
gcc=9.* + - gxx=9.* - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -30,7 +33,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 + - git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc - huggingface_hub==0.12.1 - safetensors diff --git a/environments/rocm.yml b/environments/rocm.yml index a1d3d8b0..dda2a2b2 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -29,7 +29,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 + - git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda - huggingface_hub==0.12.1 - safetensors - accelerate From 2729b7764047b7c1d35f7a20e5900d61147fe598 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 2 Apr 2023 10:32:19 +0200 Subject: [PATCH 32/47] Add offload.py adapted from llama_inference_offload.py, with multi-gpu support and some improvements. Not yet functional, and still just supports Llama --- aiserver.py | 17 +++++++++++++++-- repos/gptq | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index e7c789ac..82992461 100644 --- a/aiserver.py +++ b/aiserver.py @@ -96,6 +96,7 @@ from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant from opt import load_quant as opt_load_quant +from offload import load_quant_offload monkey_patched_4bit = False @@ -3137,6 +3138,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.model_type == "gpt2"): lowmem = {} koboldai_vars.lazy_load = False # Also, lazy loader doesn't support GPT-2 models + + gpu_layers_list = [int(l) for l in gpu_layers.split(",")] + offload_4bit = use_4_bit and sum(gpu_layers_list) < utils.num_layers(model_config) + + if offload_4bit: + koboldai_vars.lazy_load = False # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors @@ -3175,7 +3182,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(llama_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) @@ -3286,7 +3296,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal patch_causallm(model) if(koboldai_vars.hascuda): - if(koboldai_vars.usegpu): + if offload_4bit: + koboldai_vars.modeldim = get_hidden_size_from_model(model) + generator = model.generate + elif(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) if not use_4_bit: model = model.half().to(koboldai_vars.gpu_device) diff --git a/repos/gptq b/repos/gptq index 954b3218..f8bc2886 160000 --- 
a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 954b32183adda2acd437a3ab0683a28ca3c7e4c9 +Subproject commit f8bc2886cb2e2aaa704ea02404c2ff3841eb6fcf From e742083703ea8111379492c75e62f9dfffd54a28 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 2 Apr 2023 11:17:29 +0200 Subject: [PATCH 33/47] Fix multi-gpu-offloading --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index f8bc2886..971a5785 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit f8bc2886cb2e2aaa704ea02404c2ff3841eb6fcf +Subproject commit 971a5785a356460f2073b0141da0c1e0b8fdcbf6 From c8d00b7a10fd48f31f9d3fc4f4010f5481c772d4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 2 Apr 2023 18:36:31 +0200 Subject: [PATCH 34/47] Add CPU offloading support for GPT-NeoX, GPT-J and OPT --- aiserver.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 82992461..2365f58b 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3144,6 +3144,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if offload_4bit: koboldai_vars.lazy_load = False + print("4-bit CPU offloader active") # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors @@ -3176,10 +3177,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") if koboldai_vars.model_type == "gptj": - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(gptj_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(gptneox_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": if offload_4bit: @@ -3188,7 +3195,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(opt_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. 
Model type {koboldai_vars.model_type} not supported in 4-bit") From ec4177a6d6cf3549f3aebffc1a54b4799c506657 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 3 Apr 2023 06:50:36 +0200 Subject: [PATCH 35/47] Remove cudatoolkit-dev and gcc/gxx 9 from conda env because they didn't resolve on Windows --- environments/huggingface.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 71d26e9c..b48c2547 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -11,9 +11,6 @@ dependencies: - pytorch=1.11.* - python=3.8.* - cudatoolkit=11.1 - - cudatoolkit-dev=11.1 - - gcc=9.* - - gxx=9.* - eventlet=0.33.3 - dnspython=2.2.1 - markdown From b9df9b6f590388a8fc6139e25b1d1c24c21fac52 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 3 Apr 2023 20:27:17 +0200 Subject: [PATCH 36/47] Improve CPU offloading speed significantly when offloading less than half of the layers --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 971a5785..e2f567e9 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 971a5785a356460f2073b0141da0c1e0b8fdcbf6 +Subproject commit e2f567e9332c2d92f1c26ea0f7c935fe7cd65655 From ce6761e74436298424d3ea7bb964bb243e8cd88a Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 4 Apr 2023 07:46:53 +0200 Subject: [PATCH 37/47] Fix issue causing expected scalar type Float but found Half RuntimeErrors --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index e2f567e9..08c5054d 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit e2f567e9332c2d92f1c26ea0f7c935fe7cd65655 +Subproject commit 08c5054d45b8c6277e74a35841570dc7b8cbc608 From 8b4375307c2e4ea1154125fea1e00ef8c1b38415 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 5 Apr 2023 21:10:40 +0200 Subject: [PATCH 38/47] Update file formatting section in guide --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c6e922aa..0296e876 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ If you haven't already done so, create a model folder with the same name as your Put your 4bit quantized .pt in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). -Then move your model folder to KoboldAI/models, and rename the .pt in your model folder to 4bit.pt +Then move your model folder to KoboldAI/models, and rename the .pt/safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). 
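
(Editorial aside: the filename convention described in the README change above is what `prepare_4bit_load` in aiserver.py keys off. The following is a condensed sketch of that resolution logic, assuming `modelpath` is the model folder; it is an illustration rather than the exact function.)

```python
import glob, os, re
from pathlib import Path

def resolve_4bit_file(modelpath):
    # Prefer new-format files; "4bit-old" files are only used as a fallback.
    for pattern in ("4bit*.safetensors", "4bit*.pt"):
        hits = [p for p in glob.glob(os.path.join(modelpath, pattern)) if "4bit-old" not in p]
        if hits:
            fname = Path(hits[0]).parts[-1]
            g = re.findall(r"^(?:4bit)(?:-)(\d+)(?:g-?)", fname)  # 4bit-128g.safetensors -> ["128"]
            return hits[0], (int(g[0]) if g else -1)              # -1 means "no groupsize"
    for old_name in ("4bit-old.pt", "4bit-old.safetensors"):
        p = os.path.join(modelpath, old_name)
        if os.path.isfile(p):
            return p, -1  # old format; triggers the old_make_quant monkey patch
    raise RuntimeError("4-bit load failed. PT-File not found.")
```

With a file named `4bit-128g.safetensors` in the model folder, this yields that path and groupsize 128, which is then passed to the matching `*_load_quant` call (or to `load_quant_offload` when some layers are kept on the CPU).
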
From 40092cc9faed0d225391699e4cada1b9fb043dff Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 5 Apr 2023 21:49:13 +0200 Subject: [PATCH 39/47] Improve guide formatting --- README.md | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 0296e876..e103bbff 100644 --- a/README.md +++ b/README.md @@ -5,48 +5,46 @@ In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. -git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules +`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules` -cd KoboldAI +`cd KoboldAI` -Next step, subfolder mode or B: option doesn't matter choose either +Next step, (Windows) subfolder mode or B: option doesn't matter choose either -[if on Windows] install_requirements.bat if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. +[if on Windows] `install_requirements.bat` if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. -[if on Linux] install_requirements.sh +[if on Linux] `install_requirements.sh` -[if on Windows] run commandline.bat +[if on Windows] run `commandline.bat` -[if on Linux] run commandline.sh +[if on Linux] run `commandline.sh` -commandline.bat/commandline.sh will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). +`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). -cd repos +`cd repos` -cd gptq +`cd gptq` -[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] python setup_cuda.py install +[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] `python setup_cuda.py install` -[if on Linux] python setup_cuda.py install +[if on Linux] `python setup_cuda.py install` After the Cuda kernel is compiled, return to KoboldAI base directory -[if on Windows (only applies to windows users)] pip install flask_cors - If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) -Put your 4bit quantized .pt in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). +Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). 
-Then move your model folder to KoboldAI/models, and rename the .pt/safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) +Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). If you haven't done so already, exit the command prompt/leave KAI's (base) venv -Run play.bat [windows] or play.sh [linux] +Run `play.bat` [windows] or `play.sh` [linux] Switch to UI2, enable Experimental UI under the Interface tab, then load your model and be sure 4-bit toggle is on. From 636c4e5a5284fa2a11af7aba2fdf55426047eb0f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 7 Apr 2023 11:48:57 +0200 Subject: [PATCH 40/47] Update gptq repo --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 08c5054d..17c46a59 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 08c5054d45b8c6277e74a35841570dc7b8cbc608 +Subproject commit 17c46a59ff20da657e68f3267f853b0243b983bc From 7efd314428e0ad24b33fc9cd9ac19b45c6754e7b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 7 Apr 2023 20:10:24 +0200 Subject: [PATCH 41/47] Improve guide --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 86661df3..f9be9660 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,11 @@ Next step, (Windows) subfolder mode or B: option doesn't matter choose either [if on Linux] run `commandline.sh` -`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). - +`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment. +On Windows, this will show (base) at the start of the prompt line. +If it shows (base) on Linux, you most likely have a base conda environment that you need to deactivate (`conda deactivate`) +Then run `cd repos` `cd gptq` @@ -42,7 +44,7 @@ Then move your model folder to KoboldAI/models, and rename the .pt or .safetenso So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). -If you haven't done so already, exit the command prompt/leave KAI's (base) venv +If you haven't done so already, exit the command prompt/leave KAI's conda env. 
(Close the commandline window on Windows, run `exit` on Linux) Run `play.bat` [windows] or `play.sh` [linux] From b628aec7194783da09035a3b8fe01f674df542ea Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 10 Apr 2023 22:37:16 +0200 Subject: [PATCH 42/47] Automatic installation of the quant_cuda module during install_requirements Kepler (K40+) and Maxwell support --- install_requirements.bat | 4 ++++ install_requirements.sh | 3 +++ repos/gptq | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/install_requirements.bat b/install_requirements.bat index 2a4534c1..05264259 100644 --- a/install_requirements.bat +++ b/install_requirements.bat @@ -49,6 +49,8 @@ umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\h umamba.exe -r B:\ clean -a -y rd B:\Python\pkgs /S /Q subst B: /d +call B:\python\condabin\activate +cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit @@ -60,5 +62,7 @@ umamba.exe create -r miniconda3\ -n base umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingface.yml -y --always-copy umamba.exe clean -a -y rd miniconda3\Python\pkgs /S /Q +call miniconda3\condabin\activate +cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit diff --git a/install_requirements.sh b/install_requirements.sh index 6f0e0dfd..7b5a8d5b 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -5,6 +5,9 @@ wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar - bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y + +# Install quant_cuda module for 4-bit +bin/micromamba run -r runtime -n koboldai pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl exit fi if [[ $1 = "rocm" ]]; then diff --git a/repos/gptq b/repos/gptq index 17c46a59..50b22e2b 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 17c46a59ff20da657e68f3267f853b0243b983bc +Subproject commit 50b22e2ba8ec0f5cf0dca719392a2ec5254e7228 From 687d107d20345a0cc46bb069914d0ce6a3bcf43d Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 10 Apr 2023 22:46:12 +0200 Subject: [PATCH 43/47] Update README, remove steps that are no longer required --- README.md | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/README.md b/README.md index f9be9660..0657fa0b 100644 --- a/README.md +++ b/README.md @@ -15,27 +15,6 @@ Next step, (Windows) subfolder mode or B: option doesn't matter choose either [if on Linux] `install_requirements.sh` - -[if on Windows] run `commandline.bat` - -[if on Linux] run `commandline.sh` - -`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment. -On Windows, this will show (base) at the start of the prompt line. 
-If it shows (base) on Linux, you most likely have a base conda environment that you need to deactivate (`conda deactivate`) - -Then run -`cd repos` - -`cd gptq` - - -[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] `python setup_cuda.py install` - -[if on Linux] `python setup_cuda.py install` - -After the Cuda kernel is compiled, return to KoboldAI base directory - If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). From 35f908e147fcac121bdafaf7ca4b751d8091f480 Mon Sep 17 00:00:00 2001 From: biscober <50845461+biscober@users.noreply.github.com> Date: Tue, 11 Apr 2023 02:37:48 +0000 Subject: [PATCH 44/47] Update install_requirements.bat (#7) * Update install_requirements.bat move command to dismount temp B drive to after pip install command which requires B drive to still be mounted * Update install_requirements.bat cmd /k not necessary * Update install_requirements.bat add quotes (probably not required but w/e) --- install_requirements.bat | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/install_requirements.bat b/install_requirements.bat index 05264259..3b735ddf 100644 --- a/install_requirements.bat +++ b/install_requirements.bat @@ -48,9 +48,9 @@ umamba.exe create -r B:\python\ -n base umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy umamba.exe -r B:\ clean -a -y rd B:\Python\pkgs /S /Q -subst B: /d call B:\python\condabin\activate -cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" +pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" +subst B: /d pause exit @@ -63,6 +63,6 @@ umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingf umamba.exe clean -a -y rd miniconda3\Python\pkgs /S /Q call miniconda3\condabin\activate -cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" +pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit From 3eda7269f72bfa954a13aebb9d965b9c7dad9e61 Mon Sep 17 00:00:00 2001 From: Henk Date: Sat, 15 Apr 2023 14:58:24 +0200 Subject: [PATCH 45/47] Fix incorrect host merge --- aiserver.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2ec6d817..59cfac0c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1613,9 +1613,6 @@ def general_startup(override_args=None): if args.localtunnel: koboldai_vars.host = True; - if args.host == "": - koboldai_vars.host = True - args.unblock = True if args.host: # This means --host option was submitted without an argument # Enable all LAN IPs (0.0.0.0/0) From 67334bd69848bc8f3c00f1015f9f95170d2c98a3 Mon Sep 17 00:00:00 2001 From: Henk Date: Sat, 15 Apr 2023 17:45:00 +0200 Subject: [PATCH 46/47] Pin accelerate version --- environments/rocm.yml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/rocm.yml b/environments/rocm.yml index e28c86cb..c3e95496 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -32,7 
+32,7 @@ dependencies: - transformers==4.28.0 - huggingface_hub==0.12.1 - safetensors - - accelerate + - accelerate==0.18.0 - git+https://github.com/VE-FORBRYDERNE/mkultra - ansi2html - flask_compress diff --git a/requirements.txt b/requirements.txt index c2a61ca6..23468656 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ markdown bleach==4.1.0 sentencepiece protobuf -accelerate +accelerate==0.18.0 flask-session==0.4.0 marshmallow>=3.13 apispec-webframeworks From b68860b3de1adef4e162834bd524c92e39dbe264 Mon Sep 17 00:00:00 2001 From: Henk Date: Sat, 15 Apr 2023 18:31:39 +0200 Subject: [PATCH 47/47] Workaround to make --host work again --- aiserver.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/aiserver.py b/aiserver.py index 59cfac0c..886a802e 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1482,7 +1482,7 @@ def general_startup(override_args=None): parser.add_argument("--noaimenu", action='store_true', help="Disables the ability to select the AI") parser.add_argument("--ngrok", action='store_true', help="Optimizes KoboldAI for Remote Play using Ngrok") parser.add_argument("--localtunnel", action='store_true', help="Optimizes KoboldAI for Remote Play using Localtunnel") - parser.add_argument("--host", type=str, default="", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc") + parser.add_argument("--host", type=str, default="Disabled", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc") parser.add_argument("--port", type=int, help="Specify the port on which the application will be joinable") parser.add_argument("--aria2_port", type=int, help="Specify the port on which aria2's RPC interface will be open if aria2 is installed (defaults to 6799)") parser.add_argument("--model", help="Specify the Model Type to skip the Menu") @@ -1613,14 +1613,14 @@ def general_startup(override_args=None): if args.localtunnel: koboldai_vars.host = True; - if args.host: + if args.host != "Disabled": # This means --host option was submitted without an argument # Enable all LAN IPs (0.0.0.0/0) + koboldai_vars.host = True + args.unblock = True if args.host != "": # Check if --host option was submitted with an argument # Parse the supplied IP(s) and add them to the allowed IPs list - koboldai_vars.host = True - args.unblock = True enable_whitelist = True for ip_str in args.host.split(","): if "/" in ip_str: @@ -1637,6 +1637,7 @@ def general_startup(override_args=None): print(f"Allowed IPs: {allowed_ips}") + if args.cpu: koboldai_vars.use_colab_tpu = False
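
(Editorial aside: the final patch restores `--host` by defaulting the argument to the sentinel "Disabled" and treating a bare `--host` as "open to all LAN", while a comma-separated value enables IP whitelisting with single IPs, ranges and subnets. The hunk only shows the subnet check, so the sketch below is an illustrative reconstruction of that whitelist parsing, not the exact aiserver.py code.)

```python
import ipaddress

def parse_host_whitelist(host_arg):
    """Expand "127.0.0.1,192.168.1.0-192.168.1.255,10.0.0.0/24" into a set of addresses."""
    allowed_ips = set()
    for ip_str in host_arg.split(","):
        if "/" in ip_str:
            # CIDR subnet, e.g. 10.0.0.0/24
            allowed_ips.update(ipaddress.ip_network(ip_str, strict=False))
        elif "-" in ip_str:
            # inclusive range, e.g. 192.168.1.0-192.168.1.255
            start, end = (ipaddress.ip_address(p) for p in ip_str.split("-"))
            for net in ipaddress.summarize_address_range(start, end):
                allowed_ips.update(net)
        else:
            # single address, e.g. 127.0.0.1
            allowed_ips.add(ipaddress.ip_address(ip_str))
    return allowed_ips

allowed_ips = parse_host_whitelist("127.0.0.1,192.168.1.0-192.168.1.255,10.0.0.0/24")
print(f"Allowed IPs: {len(allowed_ips)} entries")
```
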