From 9e53bcf67684198bbbaeb3e67281c1641419f448 Mon Sep 17 00:00:00 2001
From: ebolam
Date: Mon, 22 May 2023 20:24:57 -0400
Subject: [PATCH 1/2] Fix for breakmodel loading to CPU when set to GPU

---
 modeling/inference_models/generic_hf_torch/class.py | 8 +++++---
 modeling/inference_models/hf.py                     | 6 ++++--
 modeling/inference_models/hf_torch.py               | 3 +++
 static/custom.css                                   | 5 +++++
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 4e2c8a5b..572337e2 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -248,11 +248,12 @@ class model_backend(HFTorchInferenceModel):
 
         self.patch_embedding()
 
+
         if utils.koboldai_vars.hascuda:
-            if utils.koboldai_vars.usegpu:
+            if self.usegpu:
                 # Use just VRAM
                 self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
-            elif utils.koboldai_vars.breakmodel:
+            elif self.breakmodel:
                 # Use both RAM and VRAM (breakmodel)
                 if not self.lazy_load:
                     self.breakmodel_device_config(self.model.config)
@@ -267,7 +268,8 @@ class model_backend(HFTorchInferenceModel):
                 self._move_to_devices()
             else:
                 self.model = self.model.to("cpu").float()
-
+
+
         self.model.kai_model = self
 
         utils.koboldai_vars.modeldim = self.get_hidden_size()
diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index 53c802b1..e801eab2 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -158,7 +158,7 @@ class HFInferenceModel(InferenceModel):
                     layers.append(None)
                 else:
                     layers.append(parameters["{}_Layers".format(i)])
-            self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None
+            self.cpu_layers = int(parameters['CPU_Layers']) if 'CPU_Layers' in parameters else None
             if isinstance(self.cpu_layers, str):
                 self.cpu_layers = int(self.cpu_layers) if self.cpu_layers.isnumeric() else 0
             self.layers = layers
@@ -167,9 +167,11 @@ class HFInferenceModel(InferenceModel):
             self.disk_layers = int(self.disk_layers) if self.disk_layers.isnumeric() else 0
             breakmodel.gpu_blocks = layers
             breakmodel.disk_blocks = self.disk_layers
-            self.usegpu = parameters['use_gpu'] if 'use_gpu' in parameters else None
+            self.usegpu = self.cpu_layers == 0 and breakmodel.disk_blocks == 0 and sum(self.layers)-self.layers[0] == 0
             self.model_type = self.get_model_type()
             self.breakmodel = ((self.model_type != 'gpt2') or self.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not self.nobreakmodel
+        else:
+            self.usegpu = parameters['use_gpu'] if 'use_gpu' in parameters else None
         self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
         self.path = parameters['path'] if 'path' in parameters else None
 
diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py
index 5595edc7..c5560360 100644
--- a/modeling/inference_models/hf_torch.py
+++ b/modeling/inference_models/hf_torch.py
@@ -126,6 +126,7 @@ class HFTorchInferenceModel(HFInferenceModel):
         return "Unknown"
 
     def _post_load(m_self) -> None:
+
        if not utils.koboldai_vars.model_type:
            utils.koboldai_vars.model_type = m_self.get_model_type()
 
@@ -562,6 +563,7 @@ class HFTorchInferenceModel(HFInferenceModel):
                             )
                         )
                         # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
+                        #logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ")
                         model_dict[key] = model_dict[key].materialize(
                             f, map_location="cpu"
                         )
@@ -847,6 +849,7 @@ class HFTorchInferenceModel(HFInferenceModel):
         # If all layers are on the same device, use the old GPU generation mode
         while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0:
             breakmodel.gpu_blocks.pop()
+        self.breakmodel = True
         if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (
             -1,
             utils.num_layers(config),
diff --git a/static/custom.css b/static/custom.css
index 412c7f1b..968d73e4 100644
--- a/static/custom.css
+++ b/static/custom.css
@@ -2404,4 +2404,9 @@ body.connected .popupfooter, .popupfooter.always-available {
     padding: 5px;
     padding-right: 0px;
     padding-top: 0px;
+}
+
+.input_error {
+    border: 5px solid red !important;
+    box-sizing: border-box !important;
 }
\ No newline at end of file
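
The heart of this first patch is the hf.py change: instead of trusting a separately passed use_gpu flag, set_input_parameters now derives usegpu from the requested layer split itself, so a model configured for the GPU no longer falls through to the CPU path in class.py. A minimal standalone sketch of that rule follows (the function name and integer-only arguments are illustrative; the real code reads these values from the UI parameters and breakmodel state):

    # Sketch of the new rule: plain GPU mode only applies when no layers go to
    # the CPU or disk and every GPU layer is assigned to the first device;
    # any other split has to go through breakmodel.
    def should_use_gpu(gpu_layers, cpu_layers, disk_layers):
        return (
            cpu_layers == 0
            and disk_layers == 0
            and sum(gpu_layers) - gpu_layers[0] == 0
        )

    print(should_use_gpu([32, 0], 0, 0))  # True  -> model.half().to(gpu_device)
    print(should_use_gpu([24, 8], 0, 0))  # False -> split across two GPUs (breakmodel)
    print(should_use_gpu([28, 0], 4, 0))  # False -> four layers stay on the CPU
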
", end="", flush=True) + #logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ") model_dict[key] = model_dict[key].materialize( f, map_location="cpu" ) @@ -847,6 +849,7 @@ class HFTorchInferenceModel(HFInferenceModel): # If all layers are on the same device, use the old GPU generation mode while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0: breakmodel.gpu_blocks.pop() + self.breakmodel = True if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in ( -1, utils.num_layers(config), diff --git a/static/custom.css b/static/custom.css index 412c7f1b..968d73e4 100644 --- a/static/custom.css +++ b/static/custom.css @@ -2404,4 +2404,9 @@ body.connected .popupfooter, .popupfooter.always-available { padding: 5px; padding-right: 0px; padding-top: 0px; +} + +.input_error { + border: 5px solid red !important; + box-sizing: border-box !important; } \ No newline at end of file From 4c25d6fbbbfad67176056a6f5af1826c2c2eb24c Mon Sep 17 00:00:00 2001 From: ebolam Date: Mon, 22 May 2023 20:34:01 -0400 Subject: [PATCH 2/2] Fix for loading model multiple times loosing the gpu/cpu splits --- modeling/inference_models/hf.py | 6 ------ modeling/inference_models/hf_torch.py | 3 +++ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index e801eab2..b50ebf56 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -197,12 +197,6 @@ class HFInferenceModel(InferenceModel): torch.cuda.empty_cache() except: pass - if self.hf_torch: - if 'breakmodel' in sys.modules: - import breakmodel - breakmodel.breakmodel = True - breakmodel.gpu_blocks = [] - breakmodel.disk_blocks = 0 def _post_load(self) -> None: # These are model specific tokenizer overrides if a model has bad defaults diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index c5560360..681d3ab1 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -788,6 +788,7 @@ class HFTorchInferenceModel(HFInferenceModel): if device_count < 2: primary = None logger.debug("n_layers: {}".format(n_layers)) + logger.debug("gpu blocks: {}".format(breakmodel.gpu_blocks)) gpu_blocks = breakmodel.gpu_blocks + ( device_count - len(breakmodel.gpu_blocks) ) * [0] @@ -818,6 +819,8 @@ class HFTorchInferenceModel(HFInferenceModel): n_layers = utils.num_layers(config) + logger.debug("gpu blocks before modification: {}".format(breakmodel.gpu_blocks)) + if utils.args.cpu: breakmodel.gpu_blocks = [0] * n_layers return