Speeds up bnb 4bit with a custom BitsAndBytesConfig

With this BitsAndBytesConfig I get about double the speed compared to running without it. (Tested on LLaMA 13B with an RTX 3090.)
Author: Alephrin
Date: 2023-07-17 04:43:43 -06:00
Committed by: GitHub
Parent: 6d7e9e6771
Commit: e9913d657a

@@ -6,7 +6,7 @@ import torch
 import shutil
 from typing import Union
-from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, GPT2LMHeadModel
+from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig
 import utils
 import modeling.lazy_loader as lazy_loader
@@ -81,6 +81,12 @@ class model_backend(HFTorchInferenceModel):
             self.lazy_load = False
             tf_kwargs.update({
                 "load_in_4bit": True,
+                "quantization_config": BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.float16,
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_quant_type='nf4'
+                ),
             })
         if self.model_type == "gpt2":
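
For reference, a minimal standalone sketch of loading a model with the same quantization settings through transformers directly. The model id and the device_map value are illustrative placeholders, not part of the commit:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Same 4-bit settings as the commit: fp16 compute dtype, double
# quantization, and the NF4 quantization type.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
)

# Placeholder model id; any causal LM that bitsandbytes supports works.
model = AutoModelForCausalLM.from_pretrained(
    "huggyllama/llama-13b",
    quantization_config=quantization_config,
    device_map="auto",
)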