diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index f95bb24a..ea4ff92d 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -107,6 +107,7 @@ class model_backend(HFTorchInferenceModel):
 
         tf_kwargs = {
             "low_cpu_mem_usage": True,
+            "use_cache": True  # Workaround for models that accidentally turn cache to false
         }
 
         if not hasattr(self.model_config, 'quantization_config'):
@@ -130,8 +131,8 @@ class model_backend(HFTorchInferenceModel):
                 })
 
         if self.model_type == "gpt2":
-            # We must disable low_cpu_mem_usage and if using a GPT-2 model
-            # because GPT-2 is not compatible with this feature yet.
+            # We must disable low_cpu_mem_usage and quantization if using a GPT-2 model
+            # because GPT-2 is not compatible with these features yet.
             tf_kwargs.pop("low_cpu_mem_usage", None)
             tf_kwargs.pop("quantization_config", None)
 
diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index e3e919b3..7e291b93 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -230,7 +230,6 @@ class HFInferenceModel(InferenceModel):
     def _post_load(self) -> None:
         self.badwordsids = koboldai_settings.badwordsids_default
         self.model_type = str(self.model_config.model_type)
-        self.model.use_cache = True  # Workaround for models that accidentally uploaded with False
 
         # These are model specific tokenizer overrides if a model has bad defaults
         if self.model_type == "llama":
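
For context, this moves the use_cache workaround from a post-load attribute assignment (`self.model.use_cache = True`) into `tf_kwargs`, which is presumably splatted into the `transformers` `from_pretrained()` call (not shown in the diff). Keyword arguments that match config attributes override whatever value was saved in the checkpoint's config, so the fix takes effect on `model.config` at load time. A minimal sketch of that behaviour, assuming the standard `transformers` API and a placeholder model path:

```python
# Minimal sketch, assuming tf_kwargs is forwarded to a standard
# transformers from_pretrained() call (not shown in the diff);
# "some/model-path" is a placeholder, not a real checkpoint.
from transformers import AutoModelForCausalLM

tf_kwargs = {
    "low_cpu_mem_usage": True,
    "use_cache": True,  # config-override kwarg: wins over a config uploaded with use_cache=False
}

model = AutoModelForCausalLM.from_pretrained("some/model-path", **tf_kwargs)
print(model.config.use_cache)  # True, regardless of what the saved config said
```

The old `self.model.use_cache = True` line only set an attribute on the model object, which is likely why it was an unreliable workaround; passing the override through `from_pretrained` updates the loaded config instead.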