From e278abd7c94b1b734c1757e34fa3ea21dae22f70 Mon Sep 17 00:00:00 2001
From: ebolam
Date: Wed, 30 Nov 2022 18:56:22 -0500
Subject: [PATCH] Copy of VE's 8 bit lazyloading/breakmodel code

---
 aiserver.py          | 18 +++++++++---------
 torch_lazy_loader.py | 32 ++++++++++++++++++++++++--------
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index b3604785..3598578b 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -2465,10 +2465,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features:
         use_8_bit = False
-    if use_8_bit:
-        koboldai_vars.lazy_load = False
-        koboldai_vars.breakmodel = False
-
     logger.info("koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load))
     if(initial_load):
         use_breakmodel_args = True
     if not utils.HAS_ACCELERATE:
@@ -2680,7 +2676,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
 
     # Lazy loader
    import torch_lazy_loader
     def get_lazy_load_callback(n_layers, convert_to_float16=True):
-        logger.info("In Callback - koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load))
         if not koboldai_vars.lazy_load:
             return
@@ -2742,6 +2737,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
 
                 koboldai_vars.loaded_layers = 0
                 utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=Send_to_socketio())
+                if koboldai_vars.bit_8_available:
+                    import bitsandbytes as bnb
                 with zipfile.ZipFile(f, "r") as z:
                     try:
                         last_storage_key = None
@@ -2769,8 +2766,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                             model_dict[key] = model_dict[key].materialize(f, map_location="cpu")
                             if model_dict[key].dtype is torch.float32:
                                 koboldai_vars.fp32_model = True
-                            if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32:
-                                model_dict[key] = model_dict[key].to(torch.float16)
+                            if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and any(model_dict[key].dtype is t for t in (torch.float32, torch.float16)):
+                                if use_8_bit:
+                                    model_dict[key] = bnb.nn.Int8Params(model_dict[key].to(torch.float16), requires_grad=False, has_fp16_weights=False).to(device if device not in ("shared", "disk") else "cpu")
+                                else:
+                                    model_dict[key] = model_dict[key].to(torch.float16)
                             if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16):
                                 model_dict[key] = model_dict[key].to(torch.float32)
                             if device == "shared":
@@ -2882,7 +2882,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 import shutil
                 shutil.move(koboldai_vars.model.replace('/', '_'), "models/{}".format(koboldai_vars.model.replace('/', '_')))
            if(koboldai_vars.lazy_load): # If we're using lazy loader, we need to figure out what the model's hidden layers are called
-                with torch_lazy_loader.use_lazy_torch_load(dematerialized_modules=True, use_accelerate_init_empty_weights=True):
+                with torch_lazy_loader.use_lazy_torch_load(bit_8_available=koboldai_vars.bit_8_available, dematerialized_modules=True, use_accelerate_init_empty_weights=True):
                     try:
                         metamodel = AutoModelForCausalLM.from_config(model_config)
                     except Exception as e:
@@ -2890,7 +2890,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                     utils.layers_module_names = utils.get_layers_module_names(metamodel)
                     utils.module_names = list(metamodel.state_dict().keys())
                     utils.named_buffers = list(metamodel.named_buffers(recurse=True))
-            with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(enable=koboldai_vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if koboldai_vars.lazy_load else None, dematerialized_modules=True):
+            with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(bit_8_available=koboldai_vars.bit_8_available, enable=koboldai_vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if koboldai_vars.lazy_load else None, dematerialized_modules=True):
                 if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
                     lowmem = {}
                 if(os.path.isdir(koboldai_vars.custmodpth)):
diff --git a/torch_lazy_loader.py b/torch_lazy_loader.py
index 1298335d..8cd7cc50 100644
--- a/torch_lazy_loader.py
+++ b/torch_lazy_loader.py
@@ -225,7 +225,11 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, miss
             try:
                 with torch.no_grad():
                     #param.copy_(input_param)
-                    new_param = torch.nn.Parameter(input_param, requires_grad=param.requires_grad)  # This line is new
+                    import bitsandbytes as bnb  # This line is new
+                    if isinstance(input_param, bnb.nn.Int8Params):  # This line is new
+                        new_param = input_param  # This line is new
+                    else:  # This line is new
+                        new_param = torch.nn.Parameter(input_param, requires_grad=param.requires_grad)  # This line is new
                     if name in self._parameters:  # This line is new
                         self._parameters[name] = new_param  # This line is new
                     if name in persistent_buffers:  # This line is new
@@ -277,7 +281,7 @@ def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler
         pickle.load = old_pickle_load
 
 @contextlib.contextmanager
-def use_lazy_torch_load(enable=True, callback: Optional[Callable] = None, dematerialized_modules=False, use_accelerate_init_empty_weights=False):
+def use_lazy_torch_load(bit_8_available=False, enable=True, callback: Optional[Callable] = None, dematerialized_modules=False, use_accelerate_init_empty_weights=False):
     if not enable:
         with use_custom_unpickler(RestrictedUnpickler):
             yield False
@@ -298,17 +302,30 @@ def use_lazy_torch_load(enable=True, callback: Optional[Callable] = None, demate
         torch.load = torch_load
 
         if dematerialized_modules:
+            old_linear_init = torch.nn.Linear.__init__
+            old_embedding_init = torch.nn.Embedding.__init__
+            old_layernorm_init = torch.nn.LayerNorm.__init__
+            old_load_from_state_dict = torch.nn.Module._load_from_state_dict
             if use_accelerate_init_empty_weights and utils.HAS_ACCELERATE:
                 import accelerate
                 init_empty_weights = accelerate.init_empty_weights()
                 init_empty_weights.__enter__()
             else:
-                old_linear_init = torch.nn.Linear.__init__
-                old_embedding_init = torch.nn.Embedding.__init__
-                old_layernorm_init = torch.nn.LayerNorm.__init__
-
+                import accelerate
+                if bit_8_available:
+                    import bitsandbytes as bnb
+
                 def linear_init(self, *args, device=None, **kwargs):
-                    return old_linear_init(self, *args, device="meta", **kwargs)
+                    if linear_init.nested_flag or not bit_8_available:
+                        return old_linear_init(self, *args, device=device, **kwargs)
+                    linear_init.nested_flag = True
+                    try:
+                        self.__class__ = bnb.nn.Linear8bitLt
+                        with accelerate.init_empty_weights():
+                            return bnb.nn.Linear8bitLt.__init__(self, *args, has_fp16_weights=False, threshold=6.0, **kwargs)
+                    finally:
+                        linear_init.nested_flag = False
+                linear_init.nested_flag = False
 
                 def embedding_init(self, *args, device=None, **kwargs):
                     return old_embedding_init(self, *args, device="meta", **kwargs)
@@ -319,7 +336,6 @@ def use_lazy_torch_load(enable=True, callback: Optional[Callable] = None, demate
                 torch.nn.Linear.__init__ = linear_init
                 torch.nn.Embedding.__init__ = embedding_init
                 torch.nn.LayerNorm.__init__ = layernorm_init
-                old_load_from_state_dict = torch.nn.Module._load_from_state_dict
                 torch.nn.Module._load_from_state_dict = _load_from_state_dict
 
         with use_custom_unpickler(_LazyUnpickler):
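
Note on the linear_init.nested_flag guard in the last torch_lazy_loader.py hunk: bnb.nn.Linear8bitLt subclasses torch.nn.Linear, so its __init__ calls back into torch.nn.Linear.__init__, which this patch has monkey-patched to linear_init itself; the flag short-circuits that second entry so the class swap does not recurse. The snippet below is a minimal, self-contained sketch of the same pattern (not part of the patch); TracingLinear is a hypothetical stand-in for Linear8bitLt, used only to illustrate the re-entrancy guard.

import torch

old_linear_init = torch.nn.Linear.__init__

class TracingLinear(torch.nn.Linear):
    # Stand-in for bnb.nn.Linear8bitLt: a Linear subclass whose __init__
    # delegates to nn.Linear.__init__ (i.e. back into the patched linear_init).
    def __init__(self, in_features, out_features, bias=True):
        super().__init__(in_features, out_features, bias=bias)
        self.traced = True

def linear_init(self, *args, **kwargs):
    if linear_init.nested_flag:
        # Re-entered via TracingLinear.__init__'s super() call; fall through
        # to the real nn.Linear.__init__ instead of swapping classes again.
        return old_linear_init(self, *args, **kwargs)
    linear_init.nested_flag = True
    try:
        self.__class__ = TracingLinear
        return TracingLinear.__init__(self, *args, **kwargs)
    finally:
        linear_init.nested_flag = False
linear_init.nested_flag = False

torch.nn.Linear.__init__ = linear_init
layer = torch.nn.Linear(4, 4)   # constructed as TracingLinear, no infinite recursion
torch.nn.Linear.__init__ = old_linear_init
assert isinstance(layer, TracingLinear) and layer.traced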