From fd6f66a98da4655773ae02e39e2299da36818c7e Mon Sep 17 00:00:00 2001
From: somebody
Date: Sat, 8 Jul 2023 13:57:05 -0500
Subject: [PATCH 1/3] Patch _rebuild_from_type_v2 to not try converting
 LazyTensors to Tensors

---
 modeling/lazy_loader.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py
index da2ec989..2af0ae51 100644
--- a/modeling/lazy_loader.py
+++ b/modeling/lazy_loader.py
@@ -57,6 +57,7 @@ import _codecs
 import os
 from typing import Any, Callable, Dict, Optional, Tuple, Type
 
+from torch import Tensor
 from torch.nn import Module
 from torch.storage import UntypedStorage
 
@@ -237,6 +238,29 @@ class SafetensorsLazyTensor(LazyTensor):
             self.checkpoint_file, tensor_key=self.key, device=self.location
         )
 
+def _patched_rebuild_from_type_v2(func, new_type, args, state):
+    """A patched version of torch._tensor._rebuild_from_type_v2 that
+    does not attempt to convert `LazyTensor`s to `torch.Tensor`s."""
+
+    ret = func(*args)
+
+    # BEGIN PATCH
+    transformation_ok = isinstance(ret, LazyTensor) and new_type == Tensor
+    if type(ret) is not new_type and not transformation_ok:
+        # END PATCH
+        ret = ret.as_subclass(new_type)
+
+    # Tensor does define __setstate__ even though it doesn't define
+    # __getstate__. So only use __setstate__ if it is NOT the one defined
+    # on Tensor
+    if (
+        getattr(ret.__class__, "__setstate__", Tensor.__setstate__)
+        is not Tensor.__setstate__
+    ):
+        ret.__setstate__(state)
+    else:
+        ret = torch._utils._set_obj_state(ret, state)
+    return ret
 
 class RestrictedUnpickler(pickle.Unpickler):
     def original_persistent_load(self, saved_id):
@@ -253,7 +277,7 @@ class RestrictedUnpickler(pickle.Unpickler):
         elif module == "torch._utils" and name == "_rebuild_tensor_v2":
             return torch._utils._rebuild_tensor_v2
         elif module == "torch._tensor" and name == "_rebuild_from_type_v2":
-            return torch._tensor._rebuild_from_type_v2
+            return _patched_rebuild_from_type_v2
         elif module == "torch" and name in (
             "DoubleStorage",
             "FloatStorage",

From c2ee30af32c68e3915585feea3d9d19c8ae70e4c Mon Sep 17 00:00:00 2001
From: somebody
Date: Sat, 8 Jul 2023 14:04:46 -0500
Subject: [PATCH 2/3] Add --panic to raise when loading fails

---
 aiserver.py                                         | 1 +
 modeling/inference_models/generic_hf_torch/class.py | 2 ++
 modeling/inference_models/hf_torch.py               | 5 +++++
 3 files changed, 8 insertions(+)

diff --git a/aiserver.py b/aiserver.py
index 49223b3a..74b8bca8 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -1400,6 +1400,7 @@ def general_startup(override_args=None):
     parser.add_argument('-f', action='store', help="option for compatability with colab memory profiles")
     parser.add_argument('-v', '--verbosity', action='count', default=0, help="The default logging level is ERROR or higher. This value increases the amount of logging seen in your screen")
     parser.add_argument('-q', '--quiesce', action='count', default=0, help="The default logging level is ERROR or higher. This value decreases the amount of logging seen in your screen")
+    parser.add_argument("--panic", action='store_true', help="Disables falling back when loading fails.")
 
     #args: argparse.Namespace = None
     if "pytest" in sys.modules and override_args is None:
diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index ad17b85b..8f024ea1 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -90,6 +90,8 @@ class model_backend(HFTorchInferenceModel):
                 utils.module_names = list(metamodel.state_dict().keys())
                 utils.named_buffers = list(metamodel.named_buffers(recurse=True))
             except Exception as e:
+                if utils.args.panic:
+                    raise e
                 logger.warning(f"Gave up on lazy loading due to {e}")
                 self.lazy_load = False
 
diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py
index 84d3447e..2249a87a 100644
--- a/modeling/inference_models/hf_torch.py
+++ b/modeling/inference_models/hf_torch.py
@@ -363,6 +363,8 @@ class HFTorchInferenceModel(HFInferenceModel):
                 return GPTNeoForCausalLM.from_pretrained(location, **tf_kwargs)
             except Exception as e:
                 logger.warning(f"{self.model_name} is a no-go; {e} - Falling back to auto.")
+                if utils.args.panic:
+                    raise e
 
         # Try to determine model type from either AutoModel or falling back to legacy
         try:
@@ -414,6 +416,9 @@ class HFTorchInferenceModel(HFInferenceModel):
                 logger.error("Invalid load key! Aborting.")
                 raise
 
+            if utils.args.panic:
+                raise e
+
             logger.warning(f"Fell back to GPT2LMHeadModel due to {e}")
             logger.debug(traceback.format_exc())
 

From 3928d86339b492466594a22c521a0d68f08024b3 Mon Sep 17 00:00:00 2001
From: somebody
Date: Sat, 8 Jul 2023 14:36:45 -0500
Subject: [PATCH 3/3] Fall back to unpatched HF

---
 modeling/inference_models/hf_torch.py | 31 ++++++++++----
 modeling/lazy_loader.py               |  4 ++
 modeling/patches.py                   | 60 +++++++++++++++------
 3 files changed, 61 insertions(+), 34 deletions(-)

diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py
index 2249a87a..fb9fe39e 100644
--- a/modeling/inference_models/hf_torch.py
+++ b/modeling/inference_models/hf_torch.py
@@ -364,7 +364,7 @@ class HFTorchInferenceModel(HFInferenceModel):
             except Exception as e:
                 logger.warning(f"{self.model_name} is a no-go; {e} - Falling back to auto.")
                 if utils.args.panic:
-                    raise e
+                    raise
 
         # Try to determine model type from either AutoModel or falling back to legacy
         try:
@@ -383,11 +383,28 @@ class HFTorchInferenceModel(HFInferenceModel):
                 metamodel
             )
 
-        with lazy_loader.use_lazy_load(
-            enable=self.lazy_load,
-            # DO NOT DEMATERIALIZE MODULES / INIT WEIGHTS EMPTY!!! IT WILL EXPLODE!!!!!!!
-            dematerialized_modules=False,
-        ):
+        try:
+            # Try to load with the lazyloader first...
+            with lazy_loader.use_lazy_load(
+                enable=self.lazy_load,
+                # DO NOT DEMATERIALIZE MODULES / INIT WEIGHTS EMPTY!!! IT WILL EXPLODE!!!!!!!
+                dematerialized_modules=False,
+            ):
+                model = AutoModelForCausalLM.from_pretrained(
+                    location,
+                    offload_folder="accelerate-disk-cache",
+                    torch_dtype=self._get_target_dtype(),
+                    **tf_kwargs,
+                )
+        except Exception as e:
+            # ...but fall back to stock HF if lazyloader fails.
+            if utils.args.panic:
+                raise
+            logger.error("Lazyloader failed, falling back to stock HF load. You may run out of RAM here. Details:")
Details:") + logger.error(e) + logger.error(traceback.format_exc()) + logger.info("Falling back to stock HF load...") + model = AutoModelForCausalLM.from_pretrained( location, offload_folder="accelerate-disk-cache", @@ -417,7 +434,7 @@ class HFTorchInferenceModel(HFInferenceModel): raise if utils.args.panic: - raise e + raise logger.warning(f"Fell back to GPT2LMHeadModel due to {e}") logger.debug(traceback.format_exc()) diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index 2af0ae51..69e0d948 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -60,6 +60,7 @@ from typing import Any, Callable, Dict, Optional, Tuple, Type from torch import Tensor from torch.nn import Module from torch.storage import UntypedStorage +from modeling.patches import LazyloadPatches # Safetensors is a dependency for the local version, TPU/Colab doesn't # support it yet. @@ -510,6 +511,8 @@ def use_lazy_load( begin_time = time.time() try: + LazyloadPatches.__enter__() + old_rebuild_tensor = torch._utils._rebuild_tensor torch._utils._rebuild_tensor = _rebuild_tensor @@ -577,6 +580,7 @@ def use_lazy_load( yield True finally: + LazyloadPatches.__exit__(None, None, None) torch._utils._rebuild_tensor = old_rebuild_tensor torch.load = old_torch_load diff --git a/modeling/patches.py b/modeling/patches.py index 827e997a..a72d533a 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -10,7 +10,9 @@ from transformers import ( PreTrainedModel, modeling_utils, ) -from modeling.lazy_loader import LazyTensor + +import torch +import modeling import utils @@ -126,27 +128,16 @@ def patch_transformers_generation() -> None: transformers.generation.logits_process.NoBadWordsLogitsProcessor.__init__ = new_init -def patch_transformers_for_lazyload() -> None: - """ - Most of the code is modified code from the Accelerate and Transformers - projects, made by HuggingFace. The license for these projects are as follows: - --- - Copyright The HuggingFace Team. All rights reserved. +class LazyloadPatches: + old_load_state_dict = transformers.modeling_utils._load_state_dict_into_meta_model - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at + def __enter__() -> None: + transformers.modeling_utils._load_state_dict_into_meta_model = ( + LazyloadPatches._load_state_dict_into_meta_model + ) - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - """ - import torch - from accelerate.utils import set_module_tensor_to_device, offload_weight + def __exit__(exc_type, exc_value, exc_traceback) -> None: + transformers.modeling_utils._load_state_dict_into_meta_model = LazyloadPatches.old_load_state_dict def _load_state_dict_into_meta_model( model, @@ -167,6 +158,26 @@ def patch_transformers_for_lazyload() -> None: is_safetensors=False, keep_in_fp32_modules=None, ): + """ + This is modified code from the Accelerate and Transformers projects, + made by HuggingFace. The license for these projects are as follows: + --- + Copyright The HuggingFace Team. All rights reserved. 
+
+        Licensed under the Apache License, Version 2.0 (the "License");
+        you may not use this file except in compliance with the License.
+        You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+        Unless required by applicable law or agreed to in writing, software
+        distributed under the License is distributed on an "AS IS" BASIS,
+        WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+        See the License for the specific language governing permissions and
+        limitations under the License.
+        """
+        from accelerate.utils import offload_weight, set_module_tensor_to_device
+
         is_quantized = is_quantized or load_in_8bit
 
         if is_quantized:
@@ -201,7 +212,7 @@ def patch_transformers_for_lazyload() -> None:
             ),
         ):
 
-            if isinstance(param, LazyTensor):
+            if isinstance(param, modeling.lazy_loader.LazyTensor):
                 # Should always be true
                 param = param.materialize(map_location="cpu")
                 utils.bar.update(1)
@@ -296,15 +307,10 @@ def patch_transformers_for_lazyload() -> None:
 
         return error_msgs, offload_index, state_dict_index
 
-    transformers.modeling_utils._load_state_dict_into_meta_model = (
-        _load_state_dict_into_meta_model
-    )
-
 
 def patch_transformers(use_tpu: bool) -> None:
     patch_transformers_download()
     patch_transformers_loader()
     if not use_tpu:
-        patch_transformers_generation()
-        patch_transformers_for_lazyload()
+        patch_transformers_generation()
\ No newline at end of file
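
A note on the pattern introduced in PATCH 3: LazyloadPatches is never used in a `with` statement. use_lazy_load drives it by calling __enter__() and __exit__(None, None, None) explicitly inside its own try/finally, which is why the methods are declared without `self` and why the original _load_state_dict_into_meta_model is captured at class-definition time. Below is a minimal, self-contained sketch of that swap-and-restore pattern; the `target` namespace and its `load` attribute are illustrative stand-ins only, whereas the real patch swaps transformers.modeling_utils._load_state_dict_into_meta_model.

import types

# Stand-in for transformers.modeling_utils; hypothetical, for illustration only.
target = types.SimpleNamespace(load=lambda: "stock loader")


class Patches:
    # Capture the original callable at class-definition time, as
    # LazyloadPatches.old_load_state_dict does.
    old_load = target.load

    # Declared without `self`, mirroring the patch's explicit-call style.
    def __enter__() -> None:
        target.load = lambda: "patched loader"

    def __exit__(exc_type, exc_value, exc_traceback) -> None:
        target.load = Patches.old_load


# How use_lazy_load drives it: explicit calls wrapped in try/finally, so the
# original loader is restored even if loading fails partway through.
Patches.__enter__()
try:
    assert target.load() == "patched loader"
finally:
    Patches.__exit__(None, None, None)

assert target.load() == "stock loader"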