Model: Respect model lazyload over kaivars

kaivars dictates model config unless it's from outside aiserver or whatever.
2025-06-05 21:59:24 +02:00 · 2023-03-09 20:29:12 -06:00
parent a472bdf6c3
commit 3646aa9e83
4 changed files with 13 additions and 9 deletions
--- a/modeling/inference_models/generic_hf_torch.py
+++ b/modeling/inference_models/generic_hf_torch.py
@@ -58,19 +58,19 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
            tf_kwargs.pop("low_cpu_mem_usage", None)

            # Also, lazy loader doesn't support GPT-2 models
-            utils.koboldai_vars.lazy_load = False
+            self.lazy_load = False

        # If we're using torch_lazy_loader, we need to get breakmodel config
        # early so that it knows where to load the individual model tensors
        if (
-            utils.koboldai_vars.lazy_load
+            self.lazy_load
            and utils.koboldai_vars.hascuda
            and utils.koboldai_vars.breakmodel
            and not utils.koboldai_vars.nobreakmodel
        ):
            self.breakmodel_device_config(self.model_config)

-        if utils.koboldai_vars.lazy_load:
+        if self.lazy_load:
            # If we're using lazy loader, we need to figure out what the model's hidden layers are called
            with torch_lazy_loader.use_lazy_torch_load(
                dematerialized_modules=True, use_accelerate_init_empty_weights=True
@@ -78,6 +78,7 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
                try:
                    metamodel = AutoModelForCausalLM.from_config(self.model_config)
                except Exception as e:
+                    print("Fell back to neo for metamodel")
                    metamodel = GPTNeoForCausalLM.from_config(self.model_config)
                utils.layers_module_names = utils.get_layers_module_names(metamodel)
                utils.module_names = list(metamodel.state_dict().keys())
@@ -85,13 +86,13 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):

        # Download model from Huggingface if it does not exist, otherwise load locally
        with self._maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(
-            enable=utils.koboldai_vars.lazy_load,
+            enable=self.lazy_load,
            callback=self._get_lazy_load_callback(utils.num_layers(self.model_config))
-            if utils.koboldai_vars.lazy_load
+            if self.lazy_load
            else None,
            dematerialized_modules=True,
        ):
-            if utils.koboldai_vars.lazy_load:
+            if self.lazy_load:
                # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
                tf_kwargs.pop("low_cpu_mem_usage", None)

@@ -248,7 +249,7 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
                self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
            elif utils.koboldai_vars.breakmodel:
                # Use both RAM and VRAM (breakmodel)
-                if not utils.koboldai_vars.lazy_load:
+                if not self.lazy_load:
                    self.breakmodel_device_config(model.config)
                self._move_to_devices()
            elif breakmodel.disk_blocks > 0:
--- a/modeling/inference_models/hf_torch.py
+++ b/modeling/inference_models/hf_torch.py
@@ -508,10 +508,13 @@ class HFTorchInferenceModel(HFInferenceModel):
                **tf_kwargs,
            )
        except Exception as e:
+            print("Fell back for model due to", e)
+
            if "out of memory" in traceback.format_exc().lower():
                raise RuntimeError(
                    "One of your GPUs ran out of memory when KoboldAI tried to load your model."
                )
+
            return GPTNeoForCausalLM.from_pretrained(
                location,
                revision=utils.koboldai_vars.revision,
--- a/modeling/inference_models/legacy_gpt2_hf.py
+++ b/modeling/inference_models/legacy_gpt2_hf.py
@@ -12,7 +12,7 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel

 class CustomGPT2HFTorchInferenceModel(HFTorchInferenceModel):
    def _load(self, save_model: bool, initial_load: bool) -> None:
-        utils.koboldai_vars.lazy_load = False
+        self.lazy_load = False

        model_path = None

--- a/modeling/post_token_hooks.py
+++ b/modeling/post_token_hooks.py
@@ -10,7 +10,7 @@ class PostTokenHooks:
        model: InferenceModel,
        input_ids: torch.LongTensor,
    ) -> None:
-        if not model.gen_state["do_streaming"]:
+        if not model.gen_state.get("do_streaming"):
            return

        if not utils.koboldai_vars.output_streaming: