From 3646aa9e838717095d111ebd058bcbeb72f4cad8 Mon Sep 17 00:00:00 2001
From: somebody
Date: Thu, 9 Mar 2023 20:29:12 -0600
Subject: [PATCH] Model: Respect model lazyload over kaivars

kaivars dictates model config unless it's set from outside aiserver.
---
 modeling/inference_models/generic_hf_torch.py | 15 ++++++++-------
 modeling/inference_models/hf_torch.py         |  3 +++
 modeling/inference_models/legacy_gpt2_hf.py   |  2 +-
 modeling/post_token_hooks.py                  |  2 +-
 4 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py
index 4501492f..ce2fedf0 100644
--- a/modeling/inference_models/generic_hf_torch.py
+++ b/modeling/inference_models/generic_hf_torch.py
@@ -58,19 +58,19 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
             tf_kwargs.pop("low_cpu_mem_usage", None)
 
             # Also, lazy loader doesn't support GPT-2 models
-            utils.koboldai_vars.lazy_load = False
+            self.lazy_load = False
 
         # If we're using torch_lazy_loader, we need to get breakmodel config
         # early so that it knows where to load the individual model tensors
         if (
-            utils.koboldai_vars.lazy_load
+            self.lazy_load
             and utils.koboldai_vars.hascuda
             and utils.koboldai_vars.breakmodel
             and not utils.koboldai_vars.nobreakmodel
         ):
             self.breakmodel_device_config(self.model_config)
 
-        if utils.koboldai_vars.lazy_load:
+        if self.lazy_load:
             # If we're using lazy loader, we need to figure out what the model's hidden layers are called
             with torch_lazy_loader.use_lazy_torch_load(
                 dematerialized_modules=True, use_accelerate_init_empty_weights=True
@@ -78,6 +78,7 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
                 try:
                     metamodel = AutoModelForCausalLM.from_config(self.model_config)
                 except Exception as e:
+                    print("Fell back to neo for metamodel")
                     metamodel = GPTNeoForCausalLM.from_config(self.model_config)
                 utils.layers_module_names = utils.get_layers_module_names(metamodel)
                 utils.module_names = list(metamodel.state_dict().keys())
@@ -85,13 +86,13 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
 
         # Download model from Huggingface if it does not exist, otherwise load locally
         with self._maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(
-            enable=utils.koboldai_vars.lazy_load,
+            enable=self.lazy_load,
             callback=self._get_lazy_load_callback(utils.num_layers(self.model_config))
-            if utils.koboldai_vars.lazy_load
+            if self.lazy_load
             else None,
             dematerialized_modules=True,
         ):
-            if utils.koboldai_vars.lazy_load:
+            if self.lazy_load:
                 # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
                 tf_kwargs.pop("low_cpu_mem_usage", None)
 
@@ -248,7 +249,7 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
                     self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
                 elif utils.koboldai_vars.breakmodel:
                     # Use both RAM and VRAM (breakmodel)
-                    if not utils.koboldai_vars.lazy_load:
+                    if not self.lazy_load:
                         self.breakmodel_device_config(model.config)
                     self._move_to_devices()
                 elif breakmodel.disk_blocks > 0:
diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py
index 1509797d..3ca8b6f2 100644
--- a/modeling/inference_models/hf_torch.py
+++ b/modeling/inference_models/hf_torch.py
@@ -508,10 +508,13 @@ class HFTorchInferenceModel(HFInferenceModel):
                 **tf_kwargs,
             )
         except Exception as e:
+            print("Fell back for model due to", e)
+
             if "out of memory" in traceback.format_exc().lower():
                 raise RuntimeError(
                     "One of your GPUs ran out of memory when KoboldAI tried to load your model."
                 )
+
             return GPTNeoForCausalLM.from_pretrained(
                 location,
                 revision=utils.koboldai_vars.revision,
diff --git a/modeling/inference_models/legacy_gpt2_hf.py b/modeling/inference_models/legacy_gpt2_hf.py
index 40f5bc8c..b710ac9f 100644
--- a/modeling/inference_models/legacy_gpt2_hf.py
+++ b/modeling/inference_models/legacy_gpt2_hf.py
@@ -12,7 +12,7 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel
 
 class CustomGPT2HFTorchInferenceModel(HFTorchInferenceModel):
     def _load(self, save_model: bool, initial_load: bool) -> None:
-        utils.koboldai_vars.lazy_load = False
+        self.lazy_load = False
 
         model_path = None
 
diff --git a/modeling/post_token_hooks.py b/modeling/post_token_hooks.py
index 750ce137..beb49103 100644
--- a/modeling/post_token_hooks.py
+++ b/modeling/post_token_hooks.py
@@ -10,7 +10,7 @@ class PostTokenHooks:
         model: InferenceModel,
         input_ids: torch.LongTensor,
     ) -> None:
-        if not model.gen_state["do_streaming"]:
+        if not model.gen_state.get("do_streaming"):
             return
 
         if not utils.koboldai_vars.output_streaming:
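
Note on the pattern: the patch moves every read of the global
utils.koboldai_vars.lazy_load over to a per-instance self.lazy_load, so
disabling lazy loading for one model (e.g. GPT-2) no longer mutates global
state seen by the rest of aiserver. A minimal sketch of that precedence,
assuming the instance flag is seeded from kaivars somewhere in the
HFInferenceModel hierarchy; the seeding is not shown in this patch, so the
__init__ below is illustrative only:

import utils

class HFInferenceModel:
    def __init__(self) -> None:
        # Hypothetical: copy the global default once at construction time.
        self.lazy_load = utils.koboldai_vars.lazy_load

    def _load(self) -> None:
        # Later checks consult only the instance copy, so flipping it here
        # affects this model alone, not the global kaivars state.
        if self.lazy_load:
            ...

The model.gen_state.get("do_streaming") change is defensive in the same
spirit: .get() returns None (falsy) instead of raising KeyError if the hook
fires before gen_state has been populated.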