diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index 93def5a6..0bb954e3 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -88,7 +88,8 @@ class model_backend(HFTorchInferenceModel): load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type='nf4' + bnb_4bit_quant_type='nf4', + llm_int8_enable_fp32_cpu_offload=True ), })