diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 4e2c8a5b..572337e2 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -248,11 +248,12 @@ class model_backend(HFTorchInferenceModel):
 
         self.patch_embedding()
 
+
         if utils.koboldai_vars.hascuda:
-            if utils.koboldai_vars.usegpu:
+            if self.usegpu:
                 # Use just VRAM
                 self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
-            elif utils.koboldai_vars.breakmodel:
+            elif self.breakmodel:
                 # Use both RAM and VRAM (breakmodel)
                 if not self.lazy_load:
                     self.breakmodel_device_config(self.model.config)
@@ -267,7 +268,8 @@ class model_backend(HFTorchInferenceModel):
                 self._move_to_devices()
         else:
             self.model = self.model.to("cpu").float()
-
+        
+
         self.model.kai_model = self
         utils.koboldai_vars.modeldim = self.get_hidden_size()
 
diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index 53c802b1..e801eab2 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -158,7 +158,7 @@ class HFInferenceModel(InferenceModel):
                     layers.append(None)
                 else:
                     layers.append(parameters["{}_Layers".format(i)])
-            self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None
+            self.cpu_layers = int(parameters['CPU_Layers']) if 'CPU_Layers' in parameters else None
             if isinstance(self.cpu_layers, str):
                 self.cpu_layers = int(self.cpu_layers) if self.cpu_layers.isnumeric() else 0
             self.layers = layers
@@ -167,9 +167,11 @@ class HFInferenceModel(InferenceModel):
                 self.disk_layers = int(self.disk_layers) if self.disk_layers.isnumeric() else 0
             breakmodel.gpu_blocks = layers
             breakmodel.disk_blocks = self.disk_layers
-            self.usegpu = parameters['use_gpu'] if 'use_gpu' in parameters else None
+            self.usegpu = self.cpu_layers == 0 and breakmodel.disk_blocks == 0 and sum(self.layers)-self.layers[0] == 0
             self.model_type = self.get_model_type()
             self.breakmodel = ((self.model_type != 'gpt2') or self.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not self.nobreakmodel
+        else:
+            self.usegpu = parameters['use_gpu'] if 'use_gpu' in parameters else None
 
         self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
         self.path = parameters['path'] if 'path' in parameters else None
diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py
index 5595edc7..c5560360 100644
--- a/modeling/inference_models/hf_torch.py
+++ b/modeling/inference_models/hf_torch.py
@@ -126,6 +126,7 @@ class HFTorchInferenceModel(HFInferenceModel):
         return "Unknown"
 
     def _post_load(m_self) -> None:
+
         if not utils.koboldai_vars.model_type:
             utils.koboldai_vars.model_type = m_self.get_model_type()
 
@@ -562,6 +563,7 @@ class HFTorchInferenceModel(HFInferenceModel):
                         )
                     )
                     # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
+                    #logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ")
                     model_dict[key] = model_dict[key].materialize(
                         f, map_location="cpu"
                     )
@@ -847,6 +849,7 @@ class HFTorchInferenceModel(HFInferenceModel):
         # If all layers are on the same device, use the old GPU generation mode
         while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0:
             breakmodel.gpu_blocks.pop()
+        self.breakmodel = True
         if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (
             -1,
             utils.num_layers(config),
diff --git a/static/custom.css b/static/custom.css
index 412c7f1b..968d73e4 100644
--- a/static/custom.css
+++ b/static/custom.css
@@ -2404,4 +2404,9 @@ body.connected .popupfooter, .popupfooter.always-available {
     padding: 5px;
     padding-right: 0px;
     padding-top: 0px;
+}
+
+.input_error {
+    border: 5px solid red !important;
+    box-sizing: border-box !important;
 }
\ No newline at end of file