Merge pull request #398 from Alephrin/patch-1

Speeds up bnb 4bit with a custom BitsAndBytesConfig
Committed by henk717 on 2023-07-17 13:22:44 +02:00 (via GitHub)


@@ -6,7 +6,7 @@ import torch
 import shutil
 from typing import Union
-from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, GPT2LMHeadModel
+from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig
 import utils
 import modeling.lazy_loader as lazy_loader
@@ -80,7 +80,12 @@ class model_backend(HFTorchInferenceModel):
         if self.use_4_bit:
             self.lazy_load = False
             tf_kwargs.update({
-                "load_in_4bit": True,
+                "quantization_config":BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.float16,
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_quant_type='nf4'
+                ),
             })
         if self.model_type == "gpt2":
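
For context, a minimal standalone sketch of the same idea outside the KoboldAI backend (the model id below is a placeholder, not part of this commit): instead of passing the bare load_in_4bit flag, an explicit BitsAndBytesConfig requests NF4 quantization, nested (double) quantization, and float16 compute, which is where the claimed 4-bit speedup comes from.

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Placeholder model id, used only for illustration.
model_id = "facebook/opt-1.3b"

# Same settings as the diff above: 4-bit NF4 weights, double quantization,
# and float16 as the compute dtype for dequantized matmuls.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",  # place layers on the available GPU(s)
)

Without bnb_4bit_compute_dtype, transformers falls back to float32 compute, which is typically the slow path the pull request is avoiding.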