From 1df03d9a27b86a086abafe80aefc0db67aa8e3f0 Mon Sep 17 00:00:00 2001
From: somebody
Date: Sun, 23 Jul 2023 20:54:04 -0500
Subject: [PATCH] Basic

---
 .../inference_models/gptq_hf_torch/class.py | 94 +++++++++++--------
 modeling/lazy_loader.py                     |  5 +
 modeling/patches.py                         | 19 ++++
 3 files changed, 79 insertions(+), 39 deletions(-)

diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py
index 81a33c70..9a1b872e 100644
--- a/modeling/inference_models/gptq_hf_torch/class.py
+++ b/modeling/inference_models/gptq_hf_torch/class.py
@@ -89,6 +89,12 @@ class model_backend(HFTorchInferenceModel):
         return bool(gptq_model)
 
     def _load(self, save_model: bool, initial_load: bool) -> None:
+        try:
+            import hf_bleeding_edge
+            from hf_bleeding_edge import AutoModelForCausalLM
+        except ImportError:
+            from transformers import AutoModelForCausalLM
+
         # Make model path the same as the model name to make this consistent
         # with the other loading method if it isn't a known model type. This
         # code is not just a workaround for below, it is also used to make the
@@ -98,7 +104,7 @@ class model_backend(HFTorchInferenceModel):
 
         self.init_model_config()
 
-        self.lazy_load = False
+        self.lazy_load = True
 
         gpulayers = self.breakmodel_config.gpu_blocks
 
@@ -181,50 +187,60 @@ class model_backend(HFTorchInferenceModel):
         model_type = self.get_model_type()
 
         logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}")
-        if model_type == "gptj":
-            model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
-        elif model_type == "gpt_neox":
-            model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
-        elif model_type == "llama":
-            model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
-        elif model_type == "opt":
-            model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
-        elif model_type == "mpt":
-            model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
-        elif model_type == "gpt_bigcode":
-            model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half()
-        else:
-            try:
-                import auto_gptq
-                from auto_gptq import AutoGPTQForCausalLM
-            except ImportError:
-                raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")
-            try:
-                import hf_bleeding_edge
-                from hf_bleeding_edge import AutoModelForCausalLM
-            except ImportError:
-                from transformers import AutoModelForCausalLM
-            # Monkey patch in hf_bleeding_edge to avoid having to trust remote code
-            auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
-            auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
-            auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM
-            model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"))
+        with lazy_loader.use_lazy_load(
+            enable=self.lazy_load,
+            dematerialized_modules=False,
+        ):
+            print(self.lazy_load)
+            if model_type == "gptj":
+                model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+            elif model_type == "gpt_neox":
+                model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+            elif model_type == "llama":
+                print("LLLLLAAAMMMAA")
+                print(torch.load)
+                model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+            elif model_type == "opt":
+                model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+            elif model_type == "mpt":
+                model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+            elif model_type == "gpt_bigcode":
+                model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half()
+            else:
+                try:
+                    import auto_gptq
+                    from auto_gptq import AutoGPTQForCausalLM
+                except ImportError:
+                    raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")
 
-        # Patch in embeddings function
-        def get_input_embeddings(self):
-            return self.model.get_input_embeddings()
+                try:
+                    import hf_bleeding_edge
+                    from hf_bleeding_edge import AutoModelForCausalLM
+                except ImportError:
+                    from transformers import AutoModelForCausalLM
 
-        type(model).get_input_embeddings = get_input_embeddings
+                # Monkey patch in hf_bleeding_edge to avoid having to trust remote code
+                auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
+                auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
+                auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM
 
-        # Patch in args support..
-        def generate(self, *args, **kwargs):
-            """shortcut for model.generate"""
-            with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
-                return self.model.generate(*args, **kwargs)
+                model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"))
 
-        type(model).generate = generate
+            # Patch in embeddings function
+            def get_input_embeddings(self):
+                return self.model.get_input_embeddings()
+
+            type(model).get_input_embeddings = get_input_embeddings
+
+            # Patch in args support..
+            def generate(self, *args, **kwargs):
+                """shortcut for model.generate"""
+                with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
+                    return self.model.generate(*args, **kwargs)
+
+            type(model).generate = generate
 
         return model
 
diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py
index 69e0d948..8fff59d3 100644
--- a/modeling/lazy_loader.py
+++ b/modeling/lazy_loader.py
@@ -358,16 +358,19 @@ def safetensors_load_tensor_independently(
 ) -> torch.Tensor:
     """A hacky way to load a tensor by itself and not mmap every single
     tensor or whatever is causing that big memory spike"""
+    print("[ld]", tensor_key)
     with safetensors.safe_open(checkpoint_file, framework="pt", device=device) as f:
         return f.get_tensor(tensor_key)
 
 
 def patch_safetensors(callback):
+    print("Hi! We are patching safetensors")
     # Safetensors load patch
     import transformers
 
     def safetensors_load(checkpoint_file: str) -> dict:
+        print("LOAD NOW", safetensors_load)
         # Monkeypatch applied to safetensors.torch.load_file
 
         if utils.koboldai_vars.hascuda:
@@ -409,6 +412,7 @@ def patch_safetensors(callback):
         return tensors
 
     transformers.modeling_utils.safe_load_file = safetensors_load
+    safetensors.torch.load_file = safetensors_load
 
 
 @contextlib.contextmanager
@@ -520,6 +524,7 @@ def use_lazy_load(
     old_torch_load = torch.load
 
     def torch_load(f, map_location=None, pickle_module=pickle, **pickle_load_args):
+        print("TORCHLOAD", f)
         model_dict = old_torch_load(
             f=f,
             map_location=map_location,
diff --git a/modeling/patches.py b/modeling/patches.py
index 6e2168f2..f5b6bd06 100644
--- a/modeling/patches.py
+++ b/modeling/patches.py
@@ -129,15 +129,34 @@ def patch_transformers_generation() -> None:
 
 
 class LazyloadPatches:
+    class StateDictFacade(dict):
+        def __init__(self, state_dict):
+            self.update(state_dict)
+
+        def __getitem__(self, name):
+            return super().__getitem__(name).materialize(map_location="cuda:0")
+
     old_load_state_dict = transformers.modeling_utils._load_state_dict_into_meta_model
+    torch_old_load_from_state_dict = torch.nn.Module._load_from_state_dict
 
     def __enter__() -> None:
         transformers.modeling_utils._load_state_dict_into_meta_model = (
             LazyloadPatches._load_state_dict_into_meta_model
         )
+        torch.nn.Module._load_from_state_dict = LazyloadPatches._torch_load_from_state_dict
+        # torch.nn.Module._load_from_state_dict = _agn
 
     def __exit__(exc_type, exc_value, exc_traceback) -> None:
         transformers.modeling_utils._load_state_dict_into_meta_model = LazyloadPatches.old_load_state_dict
+        torch.nn.Module._load_from_state_dict = LazyloadPatches.torch_old_load_from_state_dict
+
+    def _torch_load_from_state_dict(self, state_dict, *args, **kwargs):
+        return LazyloadPatches.torch_old_load_from_state_dict(
+            self,
+            LazyloadPatches.StateDictFacade(state_dict),
+            *args,
+            **kwargs
+        )
 
     def _load_state_dict_into_meta_model(
         model,
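
For reference, the technique the modeling/patches.py hunk relies on can be shown in isolation: torch.nn.Module._load_from_state_dict is swapped for a wrapper that materializes tensors only at the moment the load reads them. The sketch below is illustrative rather than part of the patch; LazyTensor is a hypothetical stand-in for the lazy tensors produced by modeling/lazy_loader.py, and it materializes to CPU instead of "cuda:0" so it runs without a GPU.

# Illustrative sketch (not part of the patch): LazyTensor is a hypothetical
# stand-in for the lazy tensors that modeling/lazy_loader.py produces.
import torch


class LazyTensor:
    """Placeholder for a tensor that is only read from storage on request."""

    def __init__(self, shape):
        self.shape = shape

    def materialize(self, map_location=None):
        # A real lazy loader would deserialize the tensor from the checkpoint here.
        return torch.zeros(self.shape, device=map_location or "cpu")


class StateDictFacade(dict):
    """Dict wrapper whose __getitem__ hands back materialized tensors."""

    def __getitem__(self, name):
        value = super().__getitem__(name)
        return value.materialize() if isinstance(value, LazyTensor) else value


_original_load = torch.nn.Module._load_from_state_dict


def _patched_load(self, state_dict, *args, **kwargs):
    # Same shape as LazyloadPatches._torch_load_from_state_dict: only the
    # state_dict argument is replaced before delegating to the original.
    return _original_load(self, StateDictFacade(state_dict), *args, **kwargs)


torch.nn.Module._load_from_state_dict = _patched_load
try:
    layer = torch.nn.Linear(4, 4)
    lazy_state = {"weight": LazyTensor((4, 4)), "bias": LazyTensor((4,))}
    # load_state_dict only touches concrete tensors inside _load_from_state_dict,
    # which is where the facade materializes them one by one.
    layer.load_state_dict(lazy_state)
finally:
    torch.nn.Module._load_from_state_dict = _original_load

Because only __getitem__ is wrapped, keys(), items(), and membership checks still see the lazy objects, so nothing is materialized until the copy into each parameter actually happens.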