Better use_cache implementation

2025-06-05 21:59:24 +02:00 · 2023-09-07 04:29:28 +02:00
parent dfb63b2340
commit 0d0a671bb9
2 changed files with 3 additions and 3 deletions
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -107,6 +107,7 @@ class model_backend(HFTorchInferenceModel):

        tf_kwargs = {
            "low_cpu_mem_usage": True,
+            "use_cache": True # Workaround for models that were accidentally uploaded with use_cache set to False
        }
        
        if not hasattr(self.model_config, 'quantization_config'):
@@ -130,8 +131,8 @@ class model_backend(HFTorchInferenceModel):
                })

        if self.model_type == "gpt2":
-            # We must disable low_cpu_mem_usage and if using a GPT-2 model
-            # because GPT-2 is not compatible with this feature yet.
+            # We must disable low_cpu_mem_usage and quantization if using a GPT-2 model
+            # because GPT-2 is not compatible with these features yet.
            tf_kwargs.pop("low_cpu_mem_usage", None)
            tf_kwargs.pop("quantization_config", None)
            
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -230,7 +230,6 @@ class HFInferenceModel(InferenceModel):
    def _post_load(self) -> None:
        self.badwordsids = koboldai_settings.badwordsids_default
        self.model_type = str(self.model_config.model_type)
-        self.model.use_cache = True # Workaround for models that accidentally uploaded with False
        
        # These are model specific tokenizer overrides if a model has bad defaults
        if self.model_type == "llama":