Mirror of https://github.com/KoboldAI/KoboldAI-Client.git, synced 2025-06-05 21:59:24 +02:00
Better use_cache implementation
@@ -107,6 +107,7 @@ class model_backend(HFTorchInferenceModel):
 
         tf_kwargs = {
             "low_cpu_mem_usage": True,
+            "use_cache": True # Workaround for models that accidentally turn cache to false
         }
 
         if not hasattr(self.model_config, 'quantization_config'):
@@ -130,8 +131,8 @@ class model_backend(HFTorchInferenceModel):
             })
 
         if self.model_type == "gpt2":
-            # We must disable low_cpu_mem_usage and if using a GPT-2 model
-            # because GPT-2 is not compatible with this feature yet.
+            # We must disable low_cpu_mem_usage and quantization if using a GPT-2 model
+            # because GPT-2 is not compatible with these features yet.
             tf_kwargs.pop("low_cpu_mem_usage", None)
             tf_kwargs.pop("quantization_config", None)
 
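For context, these tf_kwargs are the keyword arguments that eventually reach Hugging Face transformers' from_pretrained call. A minimal standalone sketch of the same pattern in plain transformers (the model id and surrounding code are illustrative assumptions, not taken from this commit):

# Sketch only: reproduces the tf_kwargs pattern from the hunks above.
# "facebook/opt-125m" is an arbitrary example model id.
from transformers import AutoConfig, AutoModelForCausalLM

model_id = "facebook/opt-125m"
config = AutoConfig.from_pretrained(model_id)

tf_kwargs = {
    "low_cpu_mem_usage": True,
    "use_cache": True,  # override checkpoints whose config ships with use_cache=False
}

# Per the commit, GPT-2 is not yet compatible with these options, so drop them.
if config.model_type == "gpt2":
    tf_kwargs.pop("low_cpu_mem_usage", None)
    tf_kwargs.pop("quantization_config", None)

model = AutoModelForCausalLM.from_pretrained(model_id, **tf_kwargs)

Passing use_cache through from_pretrained applies it to the model's config at load time, which is what allows the post-load override below to be dropped.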
@@ -230,7 +230,6 @@ class HFInferenceModel(InferenceModel):
     def _post_load(self) -> None:
         self.badwordsids = koboldai_settings.badwordsids_default
         self.model_type = str(self.model_config.model_type)
-        self.model.use_cache = True # Workaround for models that accidentally uploaded with False
 
         # These are model specific tokenizer overrides if a model has bad defaults
         if self.model_type == "llama":
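The removed line was the previous workaround: flipping the cache flag on the already-loaded model. With use_cache now passed in tf_kwargs at load time, that post-load assignment becomes redundant. A hedged before/after sketch in plain transformers (the model id is an example, and the config-level attribute is an approximation of what the removed line targeted):

from transformers import AutoModelForCausalLM

# Old approach (roughly what the removed line did): load first, then patch the flag.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
model.config.use_cache = True  # fix checkpoints accidentally uploaded with False

# New approach: set the flag as part of the load call itself.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", use_cache=True)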