From 1df03d9a27b86a086abafe80aefc0db67aa8e3f0 Mon Sep 17 00:00:00 2001 From: somebody Date: Sun, 23 Jul 2023 20:54:04 -0500 Subject: [PATCH 1/9] Basic --- .../inference_models/gptq_hf_torch/class.py | 94 +++++++++++-------- modeling/lazy_loader.py | 5 + modeling/patches.py | 19 ++++ 3 files changed, 79 insertions(+), 39 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 81a33c70..9a1b872e 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -89,6 +89,12 @@ class model_backend(HFTorchInferenceModel): return bool(gptq_model) def _load(self, save_model: bool, initial_load: bool) -> None: + try: + import hf_bleeding_edge + from hf_bleeding_edge import AutoModelForCausalLM + except ImportError: + from transformers import AutoModelForCausalLM + # Make model path the same as the model name to make this consistent # with the other loading method if it isn't a known model type. This # code is not just a workaround for below, it is also used to make the @@ -98,7 +104,7 @@ class model_backend(HFTorchInferenceModel): self.init_model_config() - self.lazy_load = False + self.lazy_load = True gpulayers = self.breakmodel_config.gpu_blocks @@ -181,50 +187,60 @@ class model_backend(HFTorchInferenceModel): model_type = self.get_model_type() logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}") - if model_type == "gptj": - model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "gpt_neox": - model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "llama": - model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "opt": - model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "mpt": - model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "gpt_bigcode": - model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half() - else: - try: - import auto_gptq - from auto_gptq import AutoGPTQForCausalLM - except ImportError: - raise RuntimeError(f"4-bit load failed. 
Model type {model_type} not supported in 4-bit") - try: - import hf_bleeding_edge - from hf_bleeding_edge import AutoModelForCausalLM - except ImportError: - from transformers import AutoModelForCausalLM - # Monkey patch in hf_bleeding_edge to avoid having to trust remote code - auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig - auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig - auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM - model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors")) + with lazy_loader.use_lazy_load( + enable=self.lazy_load, + dematerialized_modules=False, + ): + print(self.lazy_load) + if model_type == "gptj": + model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + elif model_type == "gpt_neox": + model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + elif model_type == "llama": + print("LLLLLAAAMMMAA") + print(torch.load) + model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + elif model_type == "opt": + model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + elif model_type == "mpt": + model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + elif model_type == "gpt_bigcode": + model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half() + else: + try: + import auto_gptq + from auto_gptq import AutoGPTQForCausalLM + except ImportError: + raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit") - # Patch in embeddings function - def get_input_embeddings(self): - return self.model.get_input_embeddings() + try: + import hf_bleeding_edge + from hf_bleeding_edge import AutoModelForCausalLM + except ImportError: + from transformers import AutoModelForCausalLM - type(model).get_input_embeddings = get_input_embeddings + # Monkey patch in hf_bleeding_edge to avoid having to trust remote code + auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig + auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig + auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM - # Patch in args support.. - def generate(self, *args, **kwargs): - """shortcut for model.generate""" - with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type): - return self.model.generate(*args, **kwargs) + model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors")) - type(model).generate = generate + # Patch in embeddings function + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + type(model).get_input_embeddings = get_input_embeddings + + # Patch in args support.. 
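+            # (Descriptive note: the wrapper below forwards *args/**kwargs to the wrapped
+            # model.generate under torch.inference_mode and autocast on the model's device.)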
+ def generate(self, *args, **kwargs): + """shortcut for model.generate""" + with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type): + return self.model.generate(*args, **kwargs) + + type(model).generate = generate return model diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index 69e0d948..8fff59d3 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -358,16 +358,19 @@ def safetensors_load_tensor_independently( ) -> torch.Tensor: """A hacky way to load a tensor by itself and not mmap every single tensor or whatever is causing that big memory spike""" + print("[ld]", tensor_key) with safetensors.safe_open(checkpoint_file, framework="pt", device=device) as f: return f.get_tensor(tensor_key) def patch_safetensors(callback): + print("Hi! We are patching safetensors") # Safetensors load patch import transformers def safetensors_load(checkpoint_file: str) -> dict: + print("LOAD NOW", safetensors_load) # Monkeypatch applied to safetensors.torch.load_file if utils.koboldai_vars.hascuda: @@ -409,6 +412,7 @@ def patch_safetensors(callback): return tensors transformers.modeling_utils.safe_load_file = safetensors_load + safetensors.torch.load_file = safetensors_load @contextlib.contextmanager @@ -520,6 +524,7 @@ def use_lazy_load( old_torch_load = torch.load def torch_load(f, map_location=None, pickle_module=pickle, **pickle_load_args): + print("TORCHLOAD", f) model_dict = old_torch_load( f=f, map_location=map_location, diff --git a/modeling/patches.py b/modeling/patches.py index 6e2168f2..f5b6bd06 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -129,15 +129,34 @@ def patch_transformers_generation() -> None: class LazyloadPatches: + class StateDictFacade(dict): + def __init__(self, state_dict): + self.update(state_dict) + + def __getitem__(self, name): + return super().__getitem__(name).materialize(map_location="cuda:0") + old_load_state_dict = transformers.modeling_utils._load_state_dict_into_meta_model + torch_old_load_from_state_dict = torch.nn.Module._load_from_state_dict def __enter__() -> None: transformers.modeling_utils._load_state_dict_into_meta_model = ( LazyloadPatches._load_state_dict_into_meta_model ) + torch.nn.Module._load_from_state_dict = LazyloadPatches._torch_load_from_state_dict + # torch.nn.Module._load_from_state_dict = _agn def __exit__(exc_type, exc_value, exc_traceback) -> None: transformers.modeling_utils._load_state_dict_into_meta_model = LazyloadPatches.old_load_state_dict + torch.nn.Module._load_from_state_dict = LazyloadPatches.torch_old_load_from_state_dict + + def _torch_load_from_state_dict(self, state_dict, *args, **kwargs): + return LazyloadPatches.torch_old_load_from_state_dict( + self, + LazyloadPatches.StateDictFacade(state_dict), + *args, + **kwargs + ) def _load_state_dict_into_meta_model( model, From a6aafb252534b26bbdf034788895c3317b4cdd53 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 13:07:30 -0500 Subject: [PATCH 2/9] GPTQ: Patch QuantLinear to not use CPU RAM --- .../inference_models/gptq_hf_torch/class.py | 23 ++++++++++++++++++- modeling/lazy_loader.py | 3 +-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 9a1b872e..d942a539 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -7,7 +7,7 @@ import torch import re import shutil import sys -from typing import Union +from typing import 
Dict, Union import utils import modeling.lazy_loader as lazy_loader @@ -167,6 +167,25 @@ class model_backend(HFTorchInferenceModel): self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() + def _patch_quant(self) -> None: + # QuantLinear loads on the CPU by default, using a lot of RAM! If we + # load it to the same device that the weights are gonna be on, it + # mysteriously uses no additional VRAM + + from gptq import quant_v3 + from gptq import quant_v2 + from gptq import quant_v1 + + def _ql_init_(self, *args, **kwargs): + ret = type(self)._unpatched_init(self, *args, **kwargs) + self.to("cuda:0") + return ret + + for quant_module in [quant_v3, quant_v2, quant_v1]: + quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__ + quant_module.QuantLinear.__init__ = _ql_init_ + + def _get_model(self, location: str, tf_kwargs: Dict): import gptq from gptq.gptj import load_quant as gptj_load_quant @@ -177,6 +196,8 @@ class model_backend(HFTorchInferenceModel): from gptq.mpt import load_quant as mpt_load_quant from gptq.offload import load_quant_offload + self._patch_quant() + gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location) v2_bias = False diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index 8fff59d3..a5e7c58f 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -358,7 +358,6 @@ def safetensors_load_tensor_independently( ) -> torch.Tensor: """A hacky way to load a tensor by itself and not mmap every single tensor or whatever is causing that big memory spike""" - print("[ld]", tensor_key) with safetensors.safe_open(checkpoint_file, framework="pt", device=device) as f: return f.get_tensor(tensor_key) @@ -379,7 +378,7 @@ def patch_safetensors(callback): # (70 tensors/s -> 65 tensor/s). The memory savings probably # shouldn't be the happening, maybe there's a memory leak # somewhere in our pipeline with CPU tensors. 
- intermediary_device = "cuda" + intermediary_device = "cuda:0" else: intermediary_device = "cpu" From 4a6cccb00227561454e395b796aada44a60b05cf Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 13:09:15 -0500 Subject: [PATCH 3/9] Import fix --- modeling/inference_models/gptq_hf_torch/class.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index d942a539..499b2682 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -90,7 +90,6 @@ class model_backend(HFTorchInferenceModel): def _load(self, save_model: bool, initial_load: bool) -> None: try: - import hf_bleeding_edge from hf_bleeding_edge import AutoModelForCausalLM except ImportError: from transformers import AutoModelForCausalLM From 929917efe9bb51aa4fe2147f6813205908efb3f6 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 13:09:43 -0500 Subject: [PATCH 4/9] Remove shrieking --- modeling/inference_models/gptq_hf_torch/class.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 499b2682..74f11e18 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -208,19 +208,15 @@ class model_backend(HFTorchInferenceModel): logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}") - with lazy_loader.use_lazy_load( enable=self.lazy_load, dematerialized_modules=False, ): - print(self.lazy_load) if model_type == "gptj": model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "gpt_neox": model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "llama": - print("LLLLLAAAMMMAA") - print(torch.load) model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "opt": model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) From 43a4abaf6320cc86e244cf103cc93b520339550e Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 13:10:33 -0500 Subject: [PATCH 5/9] Remove even more debug --- modeling/lazy_loader.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index a5e7c58f..74770a1c 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -176,9 +176,6 @@ class TorchLazyTensor(LazyTensor): CheckpointChunkCache.key = self.key ziproot = checkpoint.namelist()[0].split("/")[0] CheckpointChunkCache.handle = checkpoint.open(f"{ziproot}/data/{self.key}", "r") - - - else: # Cache hit. Hip hip hooray! :^) # print(".", end="", flush=True) @@ -318,7 +315,6 @@ class _LazyUnpickler(RestrictedUnpickler): lazy_loaded_storages: Dict[str, LazyTensor] def __init__(self, *args, **kwargs): - # print(args, kwargs) self.lazy_loaded_storages = {} return super().__init__(*args, **kwargs) @@ -364,12 +360,10 @@ def safetensors_load_tensor_independently( def patch_safetensors(callback): - print("Hi! 
We are patching safetensors") # Safetensors load patch import transformers def safetensors_load(checkpoint_file: str) -> dict: - print("LOAD NOW", safetensors_load) # Monkeypatch applied to safetensors.torch.load_file if utils.koboldai_vars.hascuda: @@ -523,7 +517,6 @@ def use_lazy_load( old_torch_load = torch.load def torch_load(f, map_location=None, pickle_module=pickle, **pickle_load_args): - print("TORCHLOAD", f) model_dict = old_torch_load( f=f, map_location=map_location, From 34aa333c44a16c38ce586efc0fb2118da0c20b0e Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 13:11:06 -0500 Subject: [PATCH 6/9] Last debug --- modeling/patches.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modeling/patches.py b/modeling/patches.py index f5b6bd06..5664ec07 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -144,7 +144,6 @@ class LazyloadPatches: LazyloadPatches._load_state_dict_into_meta_model ) torch.nn.Module._load_from_state_dict = LazyloadPatches._torch_load_from_state_dict - # torch.nn.Module._load_from_state_dict = _agn def __exit__(exc_type, exc_value, exc_traceback) -> None: transformers.modeling_utils._load_state_dict_into_meta_model = LazyloadPatches.old_load_state_dict From a73420c49c1371c49b59816d3122d6e6d4f3b676 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 17:15:59 -0500 Subject: [PATCH 7/9] really really really sketchy breakmodel implementation im gonna go lie down for an extended period of time --- .../inference_models/gptq_hf_torch/class.py | 175 +++++++++++++++--- 1 file changed, 153 insertions(+), 22 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 74f11e18..45d18f7b 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -82,6 +82,79 @@ def get_gptq_version(fpath): logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}") return 0, False +def load_quant_offload_device_map( + load_quant_func, model, checkpoint, wbits, groupsize, device_map, offload_type=0, force_bias=False, +): + from gptq.offload import ( + find_layers, + llama_offload_forward, + gptneox_offload_forward, + gptj_offload_forward, + opt_offload_forward, + bigcode_offload_forward + ) + from transformers.models.llama.modeling_llama import LlamaModel + from transformers.models.opt.modeling_opt import OPTModel + from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXModel + from transformers.models.gptj.modeling_gptj import GPTJModel + from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeModel + model = load_quant_func(model, checkpoint, wbits, groupsize, force_bias=force_bias) + + print(device_map) + + m, layers, remaining = find_layers(model) + + type(m).non_offload_forward = type(m).forward + + # Hook offload_forward into found model + if type(m) == LlamaModel: + type(m).forward = llama_offload_forward + elif type(m) == GPTNeoXModel: + type(m).forward = gptneox_offload_forward + elif type(m) == GPTJModel: + type(m).forward = gptj_offload_forward + elif type(m) == OPTModel: + type(m).forward = opt_offload_forward + elif type(m) == GPTBigCodeModel: + type(m).forward = bigcode_offload_forward + else: + raise RuntimeError(f"Model type {type(m)} not supported by CPU offloader") + + layers_done = len([1 for v in device_map.values() if v != "cpu"]) + print("LDone", layers_done) + + m.cpu_device = torch.device("cpu") + m.fast_offload = layers_done > len(layers) // 2 + m.layer_count = 
len(layers) + m.cpu_layers = len(layers) - layers_done + m.gpu_layers = layers_done + m.offload_type = offload_type + # HACK + m.primary_gpu = list(device_map.values())[0] + + if "layers" not in dir(m): + m.layers = layers + + print(len(layers)) + print(len(device_map)) + + print(m.primary_gpu) + for i in range(len(layers)): + dev = None + for key, device in device_map.items(): + key = int(*[x for x in key.split(".") if x.isdecimal()]) + if key == i: + dev = device + break + if dev is None: + raise ValueError + layers[key].to(dev, torch.float16, False) + + for module in remaining: + module.to(m.primary_gpu) + + return model + class model_backend(HFTorchInferenceModel): def is_valid(self, model_name, model_path, menu_path): @@ -166,7 +239,7 @@ class model_backend(HFTorchInferenceModel): self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() - def _patch_quant(self) -> None: + def _patch_quant(self, device_map) -> None: # QuantLinear loads on the CPU by default, using a lot of RAM! If we # load it to the same device that the weights are gonna be on, it # mysteriously uses no additional VRAM @@ -175,14 +248,54 @@ class model_backend(HFTorchInferenceModel): from gptq import quant_v2 from gptq import quant_v1 - def _ql_init_(self, *args, **kwargs): - ret = type(self)._unpatched_init(self, *args, **kwargs) - self.to("cuda:0") - return ret + def make_quant(module, names, bits, groupsize, name='', force_bias=False): + if isinstance(module, quant_v3.QuantLinear): + return - for quant_module in [quant_v3, quant_v2, quant_v1]: - quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__ - quant_module.QuantLinear.__init__ = _ql_init_ + for attr in dir(module): + tmp = getattr(module, attr) + name1 = name + '.' + attr if name != '' else attr + if name1 in names: + parts = name1.split(".") + device = None + for i in reversed(range(len(parts))): + maybe_key = ".".join(parts[:i]) + if maybe_key in device_map: + device = device_map[maybe_key] + break + + if device is None: + print(name1) + print(device_map) + raise ValueError + + print("[ql]", name1, device) + delattr(module, attr) + + ql = quant_v3.QuantLinear( + bits, + groupsize, + tmp.in_features, + tmp.out_features, + force_bias or tmp.bias is not None + ) + ql = ql.to(device) + + setattr(module, attr, ql) + + for name1, child in module.named_children(): + make_quant(child, names, bits, groupsize, name + '.' 
+ name1 if name != '' else name1, force_bias=force_bias) + + quant_v3.make_quant = make_quant + + # def _ql_init_(self, *args, **kwargs): + # ret = type(self)._unpatched_init(self, *args, **kwargs) + # self.to("cuda:0") + # return ret + + # for quant_module in [quant_v3, quant_v2, quant_v1]: + # quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__ + # quant_module.QuantLinear.__init__ = _ql_init_ def _get_model(self, location: str, tf_kwargs: Dict): @@ -193,9 +306,12 @@ class model_backend(HFTorchInferenceModel): from gptq.opt import load_quant as opt_load_quant from gptq.bigcode import load_quant as bigcode_load_quant from gptq.mpt import load_quant as mpt_load_quant - from gptq.offload import load_quant_offload - self._patch_quant() + try: + import hf_bleeding_edge + from hf_bleeding_edge import AutoModelForCausalLM + except ImportError: + from transformers import AutoModelForCausalLM gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location) v2_bias = False @@ -208,22 +324,43 @@ class model_backend(HFTorchInferenceModel): logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}") + device_map = {} + + if self.lazy_load: + with lazy_loader.use_lazy_load(dematerialized_modules=True): + metamodel = AutoModelForCausalLM.from_config(self.model_config) + if utils.args.cpu: + device_map = {name: "cpu" for name in utils.layers_module_names} + for name in utils.get_missing_module_names( + metamodel, list(device_map.keys()) + ): + device_map[name] = "cpu" + else: + device_map = self.breakmodel_config.get_device_map( + metamodel + ) + + self._patch_quant(device_map) + with lazy_loader.use_lazy_load( enable=self.lazy_load, dematerialized_modules=False, ): if model_type == "gptj": - model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + model = load_quant_offload_device_map(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "gpt_neox": - model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + model = load_quant_offload_device_map(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "llama": - model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + print("YE LAMA") + + # model = llama_load_quant(location, gptq_file, gptq_bits, gptq_groupsize, force_bias=v2_bias) + model = load_quant_offload_device_map(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "opt": - model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + model = load_quant_offload_device_map(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "mpt": - model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + model = load_quant_offload_device_map(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "gpt_bigcode": - model = 
load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half() + model = load_quant_offload_device_map(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias).half() else: try: import auto_gptq @@ -231,12 +368,6 @@ class model_backend(HFTorchInferenceModel): except ImportError: raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit") - try: - import hf_bleeding_edge - from hf_bleeding_edge import AutoModelForCausalLM - except ImportError: - from transformers import AutoModelForCausalLM - # Monkey patch in hf_bleeding_edge to avoid having to trust remote code auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig From ad4528b5a6882e1bdb46e111be90d6f931090733 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 17:17:57 -0500 Subject: [PATCH 8/9] critical change --- modeling/inference_models/gptq_hf_torch/class.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 45d18f7b..10349388 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -103,7 +103,6 @@ def load_quant_offload_device_map( print(device_map) m, layers, remaining = find_layers(model) - type(m).non_offload_forward = type(m).forward # Hook offload_forward into found model From c80de5120c3bfd28f5a4963eabd562915bc7d015 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 19:45:33 -0500 Subject: [PATCH 9/9] Cleanup --- .../inference_models/gptq_hf_torch/class.py | 69 ++++++------------- 1 file changed, 22 insertions(+), 47 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 10349388..6fae6779 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -100,8 +100,6 @@ def load_quant_offload_device_map( from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeModel model = load_quant_func(model, checkpoint, wbits, groupsize, force_bias=force_bias) - print(device_map) - m, layers, remaining = find_layers(model) type(m).non_offload_forward = type(m).forward @@ -120,7 +118,6 @@ def load_quant_offload_device_map( raise RuntimeError(f"Model type {type(m)} not supported by CPU offloader") layers_done = len([1 for v in device_map.values() if v != "cpu"]) - print("LDone", layers_done) m.cpu_device = torch.device("cpu") m.fast_offload = layers_done > len(layers) // 2 @@ -134,10 +131,6 @@ def load_quant_offload_device_map( if "layers" not in dir(m): m.layers = layers - print(len(layers)) - print(len(device_map)) - - print(m.primary_gpu) for i in range(len(layers)): dev = None for key, device in device_map.items(): @@ -184,10 +177,6 @@ class model_backend(HFTorchInferenceModel): except (ValueError, AttributeError): self.gpu_layers_list = [utils.num_layers(self.model_config)] - tf_kwargs = { - "low_cpu_mem_usage": True, - } - # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel)) @@ -200,9 +189,6 @@ class model_backend(HFTorchInferenceModel): 
self.breakmodel_device_config(self.model_config) if self.lazy_load: - # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - tf_kwargs.pop("low_cpu_mem_usage", None) - # If we're using lazy loader, we need to figure out what the model's hidden layers are called with lazy_loader.use_lazy_load(dematerialized_modules=True): try: @@ -218,7 +204,7 @@ class model_backend(HFTorchInferenceModel): if self.get_local_model_path(): # Model is stored locally, load it. - self.model = self._get_model(self.get_local_model_path(), tf_kwargs) + self.model = self._get_model(self.get_local_model_path()) self.tokenizer = self._get_tokenizer(self.get_local_model_path()) else: raise NotImplementedError("GPTQ Model downloading not implemented") @@ -238,17 +224,9 @@ class model_backend(HFTorchInferenceModel): self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() - def _patch_quant(self, device_map) -> None: - # QuantLinear loads on the CPU by default, using a lot of RAM! If we - # load it to the same device that the weights are gonna be on, it - # mysteriously uses no additional VRAM - - from gptq import quant_v3 - from gptq import quant_v2 - from gptq import quant_v1 - - def make_quant(module, names, bits, groupsize, name='', force_bias=False): - if isinstance(module, quant_v3.QuantLinear): + def _patch_quant(self, device_map, quant_module) -> None: + def make_quant(module, names, bits, groupsize, name='', force_bias=False, **kwargs): + if isinstance(module, quant_module.QuantLinear): return for attr in dir(module): @@ -264,19 +242,17 @@ class model_backend(HFTorchInferenceModel): break if device is None: - print(name1) - print(device_map) - raise ValueError + raise ValueError(f"No device for {name1}") - print("[ql]", name1, device) delattr(module, attr) - ql = quant_v3.QuantLinear( + ql = quant_module.QuantLinear( bits, groupsize, tmp.in_features, tmp.out_features, - force_bias or tmp.bias is not None + force_bias or tmp.bias is not None, + **kwargs, ) ql = ql.to(device) @@ -285,19 +261,21 @@ class model_backend(HFTorchInferenceModel): for name1, child in module.named_children(): make_quant(child, names, bits, groupsize, name + '.' 
+ name1 if name != '' else name1, force_bias=force_bias) - quant_v3.make_quant = make_quant - - # def _ql_init_(self, *args, **kwargs): - # ret = type(self)._unpatched_init(self, *args, **kwargs) - # self.to("cuda:0") - # return ret - - # for quant_module in [quant_v3, quant_v2, quant_v1]: - # quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__ - # quant_module.QuantLinear.__init__ = _ql_init_ + quant_module.make_quant = make_quant - def _get_model(self, location: str, tf_kwargs: Dict): + def _patch_quants(self, device_map) -> None: + # Load QuantLinears on the device corresponding to the device map + + from gptq import quant_v3 + from gptq import quant_v2 + from gptq import quant_v1 + + for quant_module in [quant_v3, quant_v2, quant_v1]: + self._patch_quant(device_map, quant_module) + + + def _get_model(self, location: str): import gptq from gptq.gptj import load_quant as gptj_load_quant from gptq.gptneox import load_quant as gptneox_load_quant @@ -339,7 +317,7 @@ class model_backend(HFTorchInferenceModel): metamodel ) - self._patch_quant(device_map) + self._patch_quants(device_map) with lazy_loader.use_lazy_load( enable=self.lazy_load, @@ -350,9 +328,6 @@ class model_backend(HFTorchInferenceModel): elif model_type == "gpt_neox": model = load_quant_offload_device_map(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "llama": - print("YE LAMA") - - # model = llama_load_quant(location, gptq_file, gptq_bits, gptq_groupsize, force_bias=v2_bias) model = load_quant_offload_device_map(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "opt": model = load_quant_offload_device_map(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)