From 8c9ed5540626655870b6c8e79b5a838f6f012a91 Mon Sep 17 00:00:00 2001
From: catboxanon <122327233+catboxanon@users.noreply.github.com>
Date: Thu, 9 Mar 2023 22:36:45 -0500
Subject: [PATCH 1/5] Update aiserver.py

---
 aiserver.py | 63 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 50 insertions(+), 13 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index 7c60b04e..4174d1fa 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -87,6 +87,38 @@ from io import BytesIO
 
 global tpu_mtj_backend
 
+from transformers.models.llama.tokenization_llama import LLaMATokenizer
+from repos.gptq.gptq import *
+from repos.gptq.modelutils import *
+from repos.gptq.quant import *
+def load_quant(model, checkpoint, wbits):
+    from transformers import LLaMAConfig, LLaMAForCausalLM
+    config = LLaMAConfig.from_pretrained(model)
+    def noop(*args, **kwargs):
+        pass
+    torch.nn.init.kaiming_uniform_ = noop
+    torch.nn.init.uniform_ = noop
+    torch.nn.init.normal_ = noop
+
+    torch.set_default_dtype(torch.half)
+    transformers.modeling_utils._init_weights = False
+    torch.set_default_dtype(torch.half)
+    model = LLaMAForCausalLM(config)
+    torch.set_default_dtype(torch.float)
+    model = model.eval()
+    layers = find_layers(model)
+    for name in ['lm_head']:
+        if name in layers:
+            del layers[name]
+    make_quant(model, layers, wbits)
+
+    print('Loading model ...')
+    model.load_state_dict(torch.load(checkpoint))
+    model.seqlen = 2048
+    print('Done.')
+
+    return model
+
 if lupa.LUA_VERSION[:2] != (5, 4):
     logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.")
 
@@ -2886,7 +2918,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
 
         @functools.lru_cache(maxsize=None)
        def get_original_key(key):
-            return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len)
+            try:
+                return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len)
+            except ValueError:
+                return key
 
         for key, value in model_dict.items():
             original_key = get_original_key(key)
@@ -3083,22 +3118,24 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
                     lowmem = {}
                 if(os.path.isdir(koboldai_vars.custmodpth)):
+                    tokenizer = LLaMATokenizer.from_pretrained(koboldai_vars.custmodpth)
+                    # try:
+                    #     tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False)
+                    # except Exception as e:
+                    #     try:
+                    #         tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
+                    #     except Exception as e:
+                    #         try:
+                    #             tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
+                    #         except Exception as e:
+                    #             tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
                     try:
-                        tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False)
-                    except Exception as e:
-                        try:
-                            tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
-                        except Exception as e:
-                            try:
-                                tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
-                            except Exception as e:
-                                tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
-                    try:
-                        model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                        # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                        model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_30B_4BIT'], 4)
                     except Exception as e:
                         if("out of memory" in traceback.format_exc().lower()):
                             raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
-                        model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                        # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
                 elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))):
                     try:
                         tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False)

From dcf9d37a00dc582618f10deef6d226f77018dc16 Mon Sep 17 00:00:00 2001
From: catboxanon <122327233+catboxanon@users.noreply.github.com>
Date: Fri, 10 Mar 2023 00:01:40 -0500
Subject: [PATCH 2/5] It just works.

---
 aiserver.py | 46 ++++++++++++++++++++++------------------------
 1 file changed, 22 insertions(+), 24 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index 4174d1fa..66aa7362 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -1142,9 +1142,9 @@ def move_model_to_devices(model):
 
     if(not utils.HAS_ACCELERATE and not koboldai_vars.breakmodel):
         if(koboldai_vars.usegpu):
-            model = model.half().to(koboldai_vars.gpu_device)
+            model = model.to(koboldai_vars.gpu_device)
         else:
-            model = model.to('cpu').float()
+            model = model.to('cpu')
         generator = model.generate
         return
 
@@ -1172,7 +1172,6 @@ def move_model_to_devices(model):
         generator = model.generate
         return
 
-    model.half()
     gc.collect()
 
     if(hasattr(model, "transformer")):
@@ -2983,10 +2982,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3)
                 #print(f"Transferring <{key}>  to  {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
                 model_dict[key] = model_dict[key].materialize(f, map_location="cpu")
-                if model_dict[key].dtype is torch.float32:
-                    koboldai_vars.fp32_model = True
-                if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32:
-                    model_dict[key] = model_dict[key].to(torch.float16)
+                # if model_dict[key].dtype is torch.float32:
+                #     koboldai_vars.fp32_model = True
+                # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32:
+                #     model_dict[key] = model_dict[key].to(torch.float16)
                 if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16):
                     model_dict[key] = model_dict[key].to(torch.float32)
                 if device == "shared":
@@ -3010,16 +3009,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             if utils.offload_index:
                 for name, tensor in utils.named_buffers:
                     dtype = tensor.dtype
-                    if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu):
-                        dtype = torch.float16
-                    if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel):
-                        dtype = torch.float32
-                    if name in model_dict and model_dict[name].dtype is not dtype:
-                        model_dict[name] = model_dict[name].to(dtype)
-                    if tensor.dtype is not dtype:
-                        tensor = tensor.to(dtype)
-                    if name not in utils.offload_index:
-                        accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index)
+                    # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu):
+                    #     dtype = torch.float16
+                    # if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel):
+                    #     dtype = torch.float32
+                    # if name in model_dict and model_dict[name].dtype is not dtype:
+                    #     model_dict[name] = model_dict[name].to(dtype)
+                    # if tensor.dtype is not dtype:
+                    #     tensor = tensor.to(dtype)
+                    # if name not in utils.offload_index:
+                    #     accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index)
                 accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache")
             utils.bar.close()
             utils.bar = None
@@ -3078,10 +3077,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             koboldai_vars.modeldim = get_hidden_size_from_model(model)
             # Is CUDA available? If so, use GPU, otherwise fall back to CPU
             if(koboldai_vars.hascuda and koboldai_vars.usegpu):
-                model = model.half().to(koboldai_vars.gpu_device)
+                model = model.to(koboldai_vars.gpu_device)
                 generator = model.generate
             else:
-                model = model.to('cpu').float()
+                model = model.to('cpu')
                 generator = model.generate
             patch_causallm(model)
         # Use the Generic implementation
@@ -3131,7 +3130,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                     #             tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
                     try:
                         # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
-                        model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_30B_4BIT'], 4)
+                        model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4)
                     except Exception as e:
                         if("out of memory" in traceback.format_exc().lower()):
                             raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
@@ -3190,7 +3189,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 import shutil
                 tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')))
                 if(koboldai_vars.fp32_model and ("breakmodel" not in globals() or not breakmodel.disk_blocks)): # Use save_pretrained to convert fp32 models to fp16, unless we are using disk cache because save_pretrained is not supported in that case
-                    model = model.half()
                     model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB")
                 else: # For fp16 models, we can just copy the model files directly
                     import transformers.configuration_utils
@@ -3224,7 +3222,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         if(koboldai_vars.hascuda):
             if(koboldai_vars.usegpu):
                 koboldai_vars.modeldim = get_hidden_size_from_model(model)
-                model = model.half().to(koboldai_vars.gpu_device)
+                model = model.to(koboldai_vars.gpu_device)
                 generator = model.generate
             elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel)
                 koboldai_vars.modeldim = get_hidden_size_from_model(model)
@@ -3236,7 +3234,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 koboldai_vars.modeldim = get_hidden_size_from_model(model)
                 generator = model.generate
             else:
-                model = model.to('cpu').float()
+                model = model.to('cpu')
                 koboldai_vars.modeldim = get_hidden_size_from_model(model)
                 generator = model.generate
         elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0):
@@ -3244,7 +3242,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             koboldai_vars.modeldim = get_hidden_size_from_model(model)
             generator = model.generate
         else:
-            model.to('cpu').float()
+            model.to('cpu')
             koboldai_vars.modeldim = get_hidden_size_from_model(model)
             generator = model.generate
 

From 3f132ce45ba61f30015147bb0d9ba26647204332 Mon Sep 17 00:00:00 2001
From: catboxanon <122327233+catboxanon@users.noreply.github.com>
Date: Fri, 10 Mar 2023 03:26:09 -0500
Subject: [PATCH 3/5] Notify if LLAMA_4BIT env var not set

---
 aiserver.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/aiserver.py b/aiserver.py
index 66aa7362..399ce434 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -3130,7 +3130,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                     #             tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
                     try:
                         # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
-                        model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4)
+                        if os.environ.get('LLAMA_4BIT') is not None:
+                            model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4)
+                        else:
+                            raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before procedding.")
+                            exit(1)
+
                     except Exception as e:
                         if("out of memory" in traceback.format_exc().lower()):
                             raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")

From 1808b0d2eca42e30bee6edd6896744cfd6995ffc Mon Sep 17 00:00:00 2001
From: catboxanon <122327233+catboxanon@users.noreply.github.com>
Date: Sat, 11 Mar 2023 12:13:22 -0500
Subject: [PATCH 4/5] Another safety check for if model is not loaded

---
 aiserver.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/aiserver.py b/aiserver.py
index 399ce434..3ec8f284 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -3133,13 +3133,17 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                         if os.environ.get('LLAMA_4BIT') is not None:
                             model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4)
                         else:
-                            raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before procedding.")
+                            raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.")
                             exit(1)
 
                     except Exception as e:
                         if("out of memory" in traceback.format_exc().lower()):
                             raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
                         # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+
+                    if model is None:
+                        raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load. Exiting.")
+                        exit(1)
                 elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))):
                     try:
                         tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False)

From bde31217f164a3aadc4282913012378a886d6058 Mon Sep 17 00:00:00 2001
From: catboxanon <122327233+catboxanon@users.noreply.github.com>
Date: Sat, 11 Mar 2023 12:15:58 -0500
Subject: [PATCH 5/5] improve model None check

---
 aiserver.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index 3ec8f284..c14ac730 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -3134,16 +3134,14 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                             model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4)
                         else:
                             raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.")
-                            exit(1)
+
+                        if model is None:
+                            raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load.")
 
                     except Exception as e:
                         if("out of memory" in traceback.format_exc().lower()):
                             raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
                         # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
-
-                    if model is None:
-                        raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load. Exiting.")
-                        exit(1)
                 elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))):
                     try:
                         tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False)
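Usage note (illustrative sketch, not part of the patches themselves): with this series applied, aiserver.py still takes the LLaMA directory containing the config and tokenizer as the normal model path (koboldai_vars.custmodpth), while the LLAMA_4BIT environment variable must point at the pre-quantized 4-bit GPTQ checkpoint, and the GPTQ code is expected under repos/gptq as imported in patch 1. The paths below are placeholders.

    import os

    MODEL_DIR = "/path/to/llama-hf-model"                           # placeholder: directory with config + tokenizer
    os.environ.setdefault("LLAMA_4BIT", "/path/to/llama-4bit.pt")   # placeholder: packed 4-bit GPTQ checkpoint

    checkpoint = os.environ.get("LLAMA_4BIT")
    if checkpoint is None:
        # Mirrors the guard added in patches 3-5: refuse to continue without the checkpoint path.
        raise RuntimeError("LLAMA_4BIT is not set; point it at the 4-bit checkpoint file.")

    # Inside the patched load_model() the equivalent calls are made with koboldai_vars.custmodpth:
    #   tokenizer = LLaMATokenizer.from_pretrained(MODEL_DIR)
    #   model = load_quant(MODEL_DIR, checkpoint, 4)   # wbits=4: build quantized layers, then load the state dict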