From 9e53bcf67684198bbbaeb3e67281c1641419f448 Mon Sep 17 00:00:00 2001
From: ebolam
Date: Mon, 22 May 2023 20:24:57 -0400
Subject: [PATCH 1/2] Fix for breakmodel loading to CPU when set to GPU

---
 modeling/inference_models/generic_hf_torch/class.py | 8 +++++---
 modeling/inference_models/hf.py                     | 6 ++++--
 modeling/inference_models/hf_torch.py               | 3 +++
 static/custom.css                                   | 5 +++++
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 4e2c8a5b..572337e2 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -248,11 +248,12 @@ class model_backend(HFTorchInferenceModel):
 
         self.patch_embedding()
 
+
         if utils.koboldai_vars.hascuda:
-            if utils.koboldai_vars.usegpu:
+            if self.usegpu:
                 # Use just VRAM
                 self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
-            elif utils.koboldai_vars.breakmodel:
+            elif self.breakmodel:
                 # Use both RAM and VRAM (breakmodel)
                 if not self.lazy_load:
                     self.breakmodel_device_config(self.model.config)
@@ -267,7 +268,8 @@ class model_backend(HFTorchInferenceModel):
                 self._move_to_devices()
             else:
                 self.model = self.model.to("cpu").float()
-
+
+
         self.model.kai_model = self
 
         utils.koboldai_vars.modeldim = self.get_hidden_size()
diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index 53c802b1..e801eab2 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -158,7 +158,7 @@ class HFInferenceModel(InferenceModel):
                     layers.append(None)
                 else:
                     layers.append(parameters["{}_Layers".format(i)])
-            self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None
+            self.cpu_layers = int(parameters['CPU_Layers']) if 'CPU_Layers' in parameters else None
             if isinstance(self.cpu_layers, str):
                 self.cpu_layers = int(self.cpu_layers) if self.cpu_layers.isnumeric() else 0
             self.layers = layers
@@ -167,9 +167,11 @@ class HFInferenceModel(InferenceModel):
             self.disk_layers = int(self.disk_layers) if self.disk_layers.isnumeric() else 0
             breakmodel.gpu_blocks = layers
             breakmodel.disk_blocks = self.disk_layers
-            self.usegpu = parameters['use_gpu'] if 'use_gpu' in parameters else None
+            self.usegpu = self.cpu_layers == 0 and breakmodel.disk_blocks == 0 and sum(self.layers)-self.layers[0] == 0
             self.model_type = self.get_model_type()
             self.breakmodel = ((self.model_type != 'gpt2') or self.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not self.nobreakmodel
+        else:
+            self.usegpu = parameters['use_gpu'] if 'use_gpu' in parameters else None
         self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
         self.path = parameters['path'] if 'path' in parameters else None
 
diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py
index 5595edc7..c5560360 100644
--- a/modeling/inference_models/hf_torch.py
+++ b/modeling/inference_models/hf_torch.py
@@ -126,6 +126,7 @@ class HFTorchInferenceModel(HFInferenceModel):
         return "Unknown"
 
     def _post_load(m_self) -> None:
+
        if not utils.koboldai_vars.model_type:
            utils.koboldai_vars.model_type = m_self.get_model_type()
 
@@ -562,6 +563,7 @@ class HFTorchInferenceModel(HFInferenceModel):
                             )
                         )
                         # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
+                        #logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ")
                         model_dict[key] = model_dict[key].materialize(
                             f, map_location="cpu"
                         )
@@ -847,6 +849,7 @@ class HFTorchInferenceModel(HFInferenceModel):
         # If all layers are on the same device, use the old GPU generation mode
         while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0:
             breakmodel.gpu_blocks.pop()
+        self.breakmodel = True
         if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (
             -1,
             utils.num_layers(config),
diff --git a/static/custom.css b/static/custom.css
index 412c7f1b..968d73e4 100644
--- a/static/custom.css
+++ b/static/custom.css
@@ -2404,4 +2404,9 @@ body.connected .popupfooter, .popupfooter.always-available {
     padding: 5px;
     padding-right: 0px;
     padding-top: 0px;
+}
+
+.input_error {
+    border: 5px solid red !important;
+    box-sizing: border-box !important;
 }
\ No newline at end of file
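
The heart of this first patch is the hf.py change: instead of trusting a separately passed use_gpu flag, set_input_parameters now derives usegpu from the requested layer split itself, so a model configured for the GPU no longer falls through to the CPU path in class.py. A minimal standalone sketch of that rule follows (the function name and integer-only arguments are illustrative; the real code reads these values from the UI parameters and breakmodel state):

    # Sketch of the new rule: plain GPU mode only applies when no layers go to
    # the CPU or disk and every GPU layer is assigned to the first device;
    # any other split has to go through breakmodel.
    def should_use_gpu(gpu_layers, cpu_layers, disk_layers):
        return (
            cpu_layers == 0
            and disk_layers == 0
            and sum(gpu_layers) - gpu_layers[0] == 0
        )

    print(should_use_gpu([32, 0], 0, 0))  # True  -> model.half().to(gpu_device)
    print(should_use_gpu([24, 8], 0, 0))  # False -> split across two GPUs (breakmodel)
    print(should_use_gpu([28, 0], 4, 0))  # False -> four layers stay on the CPU
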
", end="", flush=True) + #logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ") model_dict[key] = model_dict[key].materialize( f, map_location="cpu" ) @@ -847,6 +849,7 @@ class HFTorchInferenceModel(HFInferenceModel): # If all layers are on the same device, use the old GPU generation mode while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0: breakmodel.gpu_blocks.pop() + self.breakmodel = True if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in ( -1, utils.num_layers(config), diff --git a/static/custom.css b/static/custom.css index 412c7f1b..968d73e4 100644 --- a/static/custom.css +++ b/static/custom.css @@ -2404,4 +2404,9 @@ body.connected .popupfooter, .popupfooter.always-available { padding: 5px; padding-right: 0px; padding-top: 0px; +} + +.input_error { + border: 5px solid red !important; + box-sizing: border-box !important; } \ No newline at end of file From 4c25d6fbbbfad67176056a6f5af1826c2c2eb24c Mon Sep 17 00:00:00 2001 From: ebolam Date: Mon, 22 May 2023 20:34:01 -0400 Subject: [PATCH 2/2] Fix for loading model multiple times loosing the gpu/cpu splits --- modeling/inference_models/hf.py | 6 ------ modeling/inference_models/hf_torch.py | 3 +++ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index e801eab2..b50ebf56 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -197,12 +197,6 @@ class HFInferenceModel(InferenceModel): torch.cuda.empty_cache() except: pass - if self.hf_torch: - if 'breakmodel' in sys.modules: - import breakmodel - breakmodel.breakmodel = True - breakmodel.gpu_blocks = [] - breakmodel.disk_blocks = 0 def _post_load(self) -> None: # These are model specific tokenizer overrides if a model has bad defaults diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index c5560360..681d3ab1 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -788,6 +788,7 @@ class HFTorchInferenceModel(HFInferenceModel): if device_count < 2: primary = None logger.debug("n_layers: {}".format(n_layers)) + logger.debug("gpu blocks: {}".format(breakmodel.gpu_blocks)) gpu_blocks = breakmodel.gpu_blocks + ( device_count - len(breakmodel.gpu_blocks) ) * [0] @@ -818,6 +819,8 @@ class HFTorchInferenceModel(HFInferenceModel): n_layers = utils.num_layers(config) + logger.debug("gpu blocks before modification: {}".format(breakmodel.gpu_blocks)) + if utils.args.cpu: breakmodel.gpu_blocks = [0] * n_layers return