Model: Respect model lazyload over kaivars

kaivars dictates model config unless it's coming from outside aiserver.
somebody
2023-03-09 20:29:12 -06:00
parent a472bdf6c3
commit 3646aa9e83
4 changed files with 13 additions and 9 deletions
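
The diff below swaps direct reads and writes of the shared utils.koboldai_vars.lazy_load for an instance attribute, self.lazy_load, so a single model can opt out of lazy loading without clobbering the global setting. A minimal sketch of that pattern follows, with stand-in names: the KaiVars class and the place where self.lazy_load is first seeded are assumptions for illustration, since the commit only shows the attribute being read and overridden.

class KaiVars:
    """Stand-in for utils.koboldai_vars (assumption, not the real object)."""
    lazy_load = True


class InferenceModelSketch:
    def __init__(self, koboldai_vars: KaiVars, model_type: str) -> None:
        self.model_type = model_type
        # Seed once from the shared settings object...
        self.lazy_load = koboldai_vars.lazy_load

    def configure(self) -> None:
        # ...then override per instance without mutating the shared object;
        # the GPT-2 path in the diff below disables lazy loading for itself only.
        if self.model_type == "gpt2":
            self.lazy_load = False


kaivars = KaiVars()
gpt2 = InferenceModelSketch(kaivars, "gpt2")
gpt2.configure()
assert gpt2.lazy_load is False and kaivars.lazy_load is True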

View File

@@ -58,19 +58,19 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
             tf_kwargs.pop("low_cpu_mem_usage", None)

             # Also, lazy loader doesn't support GPT-2 models
-            utils.koboldai_vars.lazy_load = False
+            self.lazy_load = False

         # If we're using torch_lazy_loader, we need to get breakmodel config
         # early so that it knows where to load the individual model tensors
         if (
-            utils.koboldai_vars.lazy_load
+            self.lazy_load
             and utils.koboldai_vars.hascuda
             and utils.koboldai_vars.breakmodel
             and not utils.koboldai_vars.nobreakmodel
         ):
             self.breakmodel_device_config(self.model_config)

-        if utils.koboldai_vars.lazy_load:
+        if self.lazy_load:
             # If we're using lazy loader, we need to figure out what the model's hidden layers are called
             with torch_lazy_loader.use_lazy_torch_load(
                 dematerialized_modules=True, use_accelerate_init_empty_weights=True
@@ -78,6 +78,7 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
                 try:
                     metamodel = AutoModelForCausalLM.from_config(self.model_config)
                 except Exception as e:
+                    print("Fell back to neo for metamodel")
                     metamodel = GPTNeoForCausalLM.from_config(self.model_config)
                 utils.layers_module_names = utils.get_layers_module_names(metamodel)
                 utils.module_names = list(metamodel.state_dict().keys())
@@ -85,13 +86,13 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
         # Download model from Huggingface if it does not exist, otherwise load locally
         with self._maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(
-            enable=utils.koboldai_vars.lazy_load,
+            enable=self.lazy_load,
             callback=self._get_lazy_load_callback(utils.num_layers(self.model_config))
-            if utils.koboldai_vars.lazy_load
+            if self.lazy_load
             else None,
             dematerialized_modules=True,
         ):
-            if utils.koboldai_vars.lazy_load:
+            if self.lazy_load:
                 # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
                 tf_kwargs.pop("low_cpu_mem_usage", None)
@@ -248,7 +249,7 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
                 self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
             elif utils.koboldai_vars.breakmodel:
                 # Use both RAM and VRAM (breakmodel)
-                if not utils.koboldai_vars.lazy_load:
+                if not self.lazy_load:
                     self.breakmodel_device_config(model.config)
                 self._move_to_devices()
             elif breakmodel.disk_blocks > 0:
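
As the hunks above note, lazy loading and Transformers' low_cpu_mem_usage cannot be used together, which is why tf_kwargs is pruned whenever self.lazy_load is set. A hedged sketch of that kwarg handling, using a hypothetical prepare_tf_kwargs helper that is not part of the repository:

from typing import Any, Dict


def prepare_tf_kwargs(lazy_load: bool, tf_kwargs: Dict[str, Any]) -> Dict[str, Any]:
    # Copy so the caller's dict is untouched.
    tf_kwargs = dict(tf_kwargs)
    if lazy_load:
        # torch_lazy_loader and low_cpu_mem_usage can't be used at the same time.
        tf_kwargs.pop("low_cpu_mem_usage", None)
    return tf_kwargs


print(prepare_tf_kwargs(True, {"low_cpu_mem_usage": True}))   # {}
print(prepare_tf_kwargs(False, {"low_cpu_mem_usage": True}))  # {'low_cpu_mem_usage': True}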

View File

@@ -508,10 +508,13 @@ class HFTorchInferenceModel(HFInferenceModel):
                 **tf_kwargs,
             )
         except Exception as e:
+            print("Fell back for model due to", e)
+
             if "out of memory" in traceback.format_exc().lower():
                 raise RuntimeError(
                     "One of your GPUs ran out of memory when KoboldAI tried to load your model."
                 )
+
             return GPTNeoForCausalLM.from_pretrained(
                 location,
                 revision=utils.koboldai_vars.revision,

View File

@@ -12,7 +12,7 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel
 class CustomGPT2HFTorchInferenceModel(HFTorchInferenceModel):
     def _load(self, save_model: bool, initial_load: bool) -> None:
-        utils.koboldai_vars.lazy_load = False
+        self.lazy_load = False

         model_path = None

View File

@@ -10,7 +10,7 @@ class PostTokenHooks:
         model: InferenceModel,
         input_ids: torch.LongTensor,
     ) -> None:
-        if not model.gen_state["do_streaming"]:
+        if not model.gen_state.get("do_streaming"):
             return

         if not utils.koboldai_vars.output_streaming:
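
The last hunk swaps gen_state["do_streaming"] indexing for gen_state.get("do_streaming"): indexing raises KeyError if the key was never set, while .get() returns None, which reads as "not streaming" and lets the hook bail out quietly. A toy illustration with a plain dict standing in for model.gen_state:

gen_state = {}  # plain dict stand-in for model.gen_state

try:
    if not gen_state["do_streaming"]:
        pass
except KeyError:
    print("old behaviour: KeyError before the hook could bail out")

if not gen_state.get("do_streaming"):
    print("new behaviour: missing key reads as falsy, so the hook just returns early")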