Mirror of https://github.com/KoboldAI/KoboldAI-Client.git
Permit CPU layers on 4-bit (Worse than GGML)
@@ -88,7 +88,8 @@ class model_backend(HFTorchInferenceModel):
                     load_in_4bit=True,
                     bnb_4bit_compute_dtype=torch.float16,
                     bnb_4bit_use_double_quant=True,
-                    bnb_4bit_quant_type='nf4'
+                    bnb_4bit_quant_type='nf4',
+                    llm_int8_enable_fp32_cpu_offload=True
                 ),
             })
 
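The diff adds `llm_int8_enable_fp32_cpu_offload=True` to the existing 4-bit `BitsAndBytesConfig`. With this flag set, any modules that the device map places on the CPU are kept in fp32 instead of being quantized, which is what allows a 4-bit model to be split across GPU and CPU RAM (and why the commit title notes it is slower than GGML for the CPU portion). Below is a minimal sketch of how such a config is normally passed to transformers' `from_pretrained`; the model name and `device_map` value are illustrative assumptions, not taken from the commit.

```python
# Sketch: load a model with the 4-bit config from this commit while
# permitting some layers to spill onto the CPU in fp32.
# The model name and device_map are examples, not specified by the commit.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    # Without this flag, loading fails if the device map assigns any module
    # to the CPU; with it, CPU-resident modules stay in fp32 (unquantized).
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "KoboldAI/OPT-6.7B-Nerybus-Mix",  # example model, not from the commit
    quantization_config=quant_config,
    device_map="auto",  # lets accelerate place overflow layers on the CPU
)
```

In this sketch, GPU-resident layers run in 4-bit NF4 with fp16 compute, while CPU-resident layers run as plain fp32, so generation speed drops sharply as more layers are offloaded.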