Mirror of https://github.com/KoboldAI/KoboldAI-Client.git, synced 2025-06-05 21:59:24 +02:00
Model: Respect model lazyload over kaivars
kaivars dictates model config unless it's set from outside aiserver.
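In rough terms, the change below moves the lazy-load decision off the global utils.koboldai_vars object and onto the model instance, so each inference model can override it without clobbering shared state. A minimal sketch of that pattern, using hypothetical GlobalVars and ToyInferenceModel classes rather than the actual KoboldAI ones:

    # Sketch only: a per-instance flag takes precedence over the global default.
    class GlobalVars:
        lazy_load = True  # global default, set from outside (e.g. by aiserver)


    class ToyInferenceModel:
        def __init__(self, global_vars: GlobalVars):
            # Seed the instance flag from the global default once...
            self.lazy_load = global_vars.lazy_load

        def force_eager_load(self):
            # ...then later overrides touch only this instance,
            # never the shared GlobalVars object.
            self.lazy_load = False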
@@ -58,19 +58,19 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
             tf_kwargs.pop("low_cpu_mem_usage", None)
 
             # Also, lazy loader doesn't support GPT-2 models
-            utils.koboldai_vars.lazy_load = False
+            self.lazy_load = False
 
         # If we're using torch_lazy_loader, we need to get breakmodel config
         # early so that it knows where to load the individual model tensors
         if (
-            utils.koboldai_vars.lazy_load
+            self.lazy_load
             and utils.koboldai_vars.hascuda
             and utils.koboldai_vars.breakmodel
             and not utils.koboldai_vars.nobreakmodel
         ):
             self.breakmodel_device_config(self.model_config)
 
-        if utils.koboldai_vars.lazy_load:
+        if self.lazy_load:
             # If we're using lazy loader, we need to figure out what the model's hidden layers are called
             with torch_lazy_loader.use_lazy_torch_load(
                 dematerialized_modules=True, use_accelerate_init_empty_weights=True
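Continuing the toy sketch above, the practical effect of this first hunk is that disabling lazy loading for a GPT-2 model (which the lazy loader doesn't support) now only flips the instance attribute, leaving the global default intact for other models:

    vars_ = GlobalVars()
    gpt2_model = ToyInferenceModel(vars_)
    gpt2_model.force_eager_load()

    print(gpt2_model.lazy_load)  # False: this model loads eagerly
    print(vars_.lazy_load)       # still True: the global default is untouched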
@@ -78,6 +78,7 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
                 try:
                     metamodel = AutoModelForCausalLM.from_config(self.model_config)
                 except Exception as e:
+                    print("Fell back to neo for metamodel")
                     metamodel = GPTNeoForCausalLM.from_config(self.model_config)
                 utils.layers_module_names = utils.get_layers_module_names(metamodel)
                 utils.module_names = list(metamodel.state_dict().keys())
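The try/except in this hunk builds a weight-free "meta" model from the config so the lazy loader can learn the module names, falling back to GPTNeoForCausalLM (now with a print noting the fallback) when AutoModelForCausalLM can't handle the config. A standalone sketch of the same idea using only public transformers and accelerate APIs, with GPTNeoConfig standing in for self.model_config:

    # Enumerate a model's parameter names without materializing any weights.
    from accelerate import init_empty_weights
    from transformers import AutoModelForCausalLM, GPTNeoConfig, GPTNeoForCausalLM

    config = GPTNeoConfig()  # stand-in for self.model_config

    with init_empty_weights():  # parameters land on the "meta" device
        try:
            metamodel = AutoModelForCausalLM.from_config(config)
        except Exception:
            # Mirror the diff's fallback: assume a GPT-Neo-shaped model.
            metamodel = GPTNeoForCausalLM(config)

    module_names = list(metamodel.state_dict().keys())
    print(module_names[:3])  # e.g. transformer.wte.weight, ...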
@@ -85,13 +86,13 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
 
         # Download model from Huggingface if it does not exist, otherwise load locally
         with self._maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(
-            enable=utils.koboldai_vars.lazy_load,
+            enable=self.lazy_load,
             callback=self._get_lazy_load_callback(utils.num_layers(self.model_config))
-            if utils.koboldai_vars.lazy_load
+            if self.lazy_load
             else None,
             dematerialized_modules=True,
         ):
-            if utils.koboldai_vars.lazy_load:
+            if self.lazy_load:
                 # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
                 tf_kwargs.pop("low_cpu_mem_usage", None)
 
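use_lazy_torch_load is KoboldAI's own context manager, so its internals aren't shown here, but the pattern in this hunk, a context manager gated by an enable flag and fed an optional callback, can be sketched generically (illustrative names, not the real implementation):

    from contextlib import contextmanager
    from typing import Callable, Optional

    @contextmanager
    def maybe_lazy_load(enable: bool, callback: Optional[Callable] = None):
        if not enable:
            yield          # disabled: change nothing, just run the body
            return
        print("lazy loading on", "(with callback)" if callback else "")
        try:
            yield          # model loading happens inside the with-block
        finally:
            print("lazy loading off")

    # Usage mirrors the diff: the callback is only built when lazy_load is set.
    lazy_load = True
    with maybe_lazy_load(
        enable=lazy_load,
        callback=(lambda *args: None) if lazy_load else None,
    ):
        pass  # from_pretrained(...) would run here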
@@ -248,7 +249,7 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
             self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
         elif utils.koboldai_vars.breakmodel:
             # Use both RAM and VRAM (breakmodel)
-            if not utils.koboldai_vars.lazy_load:
+            if not self.lazy_load:
                 self.breakmodel_device_config(model.config)
             self._move_to_devices()
         elif breakmodel.disk_blocks > 0:
@@ -508,10 +508,13 @@ class HFTorchInferenceModel(HFInferenceModel):
                 **tf_kwargs,
             )
         except Exception as e:
+            print("Fell back for model due to", e)
+
             if "out of memory" in traceback.format_exc().lower():
                 raise RuntimeError(
                     "One of your GPUs ran out of memory when KoboldAI tried to load your model."
                 )
+
             return GPTNeoForCausalLM.from_pretrained(
                 location,
                 revision=utils.koboldai_vars.revision,
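The print added in this hunk logs why the generic loading path failed before the code checks the traceback for an out-of-memory signature and otherwise retries with GPTNeoForCausalLM. A cut-down sketch of that triage pattern, with load_with_auto and load_with_gptneo as hypothetical stand-ins for the real from_pretrained calls:

    import traceback

    def load_with_auto(location: str):
        # Stand-in for AutoModelForCausalLM.from_pretrained(...)
        raise ValueError("unsupported config")

    def load_with_gptneo(location: str):
        # Stand-in for GPTNeoForCausalLM.from_pretrained(...)
        return f"gpt-neo model loaded from {location}"

    def load_model(location: str):
        try:
            return load_with_auto(location)
        except Exception as e:
            print("Fell back for model due to", e)
            if "out of memory" in traceback.format_exc().lower():
                raise RuntimeError(
                    "One of your GPUs ran out of memory when KoboldAI tried to load your model."
                )
            return load_with_gptneo(location)

    print(load_model("./models/example"))  # hypothetical path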
@@ -12,7 +12,7 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel
 
 class CustomGPT2HFTorchInferenceModel(HFTorchInferenceModel):
     def _load(self, save_model: bool, initial_load: bool) -> None:
-        utils.koboldai_vars.lazy_load = False
+        self.lazy_load = False
 
         model_path = None
 
@@ -10,7 +10,7 @@ class PostTokenHooks:
         model: InferenceModel,
         input_ids: torch.LongTensor,
     ) -> None:
-        if not model.gen_state["do_streaming"]:
+        if not model.gen_state.get("do_streaming"):
             return
 
         if not utils.koboldai_vars.output_streaming:
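The last hunk swaps dictionary indexing for dict.get(): if "do_streaming" was never placed into gen_state, indexing raises KeyError inside the token hook, whereas .get() returns None and the hook just returns early. A two-line illustration:

    gen_state = {}                        # "do_streaming" never set
    print(gen_state.get("do_streaming"))  # None, so `if not ...` bails out cleanly
    # gen_state["do_streaming"]           # would raise KeyError instead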