From e9913d657ab0675673e3801a92ef4f48ce3c23c7 Mon Sep 17 00:00:00 2001
From: Alephrin <84307744+Alephrin@users.noreply.github.com>
Date: Mon, 17 Jul 2023 04:43:43 -0600
Subject: [PATCH] Speeds up bnb 4bit with a custom BitsAndBytesConfig

With this BitsAndBytesConfig I get about double the speed compared to
running without it. (Tested on llama 13B with a 3090)
---
 modeling/inference_models/generic_hf_torch/class.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 40006dab..b51d8f66 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -6,7 +6,7 @@ import torch
 import shutil
 from typing import Union
 
-from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, GPT2LMHeadModel
+from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig
 
 import utils
 import modeling.lazy_loader as lazy_loader
@@ -81,6 +81,12 @@ class model_backend(HFTorchInferenceModel):
             self.lazy_load = False
             tf_kwargs.update({
                 "load_in_4bit": True,
+                "quantization_config":BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.float16,
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_quant_type='nf4'
+                ),
             })
 
         if self.model_type == "gpt2":
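
For reference, here is a minimal standalone sketch of the same 4-bit setup the patch wires into tf_kwargs. The likely source of the speedup is that transformers defaults bnb_4bit_compute_dtype to torch.float32 when no config is given, so forcing float16 compute roughly doubles throughput. The model id below is a placeholder, not something from this repo; any causal LM checkpoint should work.

```python
# Sketch: load a model 4-bit quantized with the same BitsAndBytesConfig
# options as the patch. Requires transformers, accelerate, and bitsandbytes.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # quantize weights to 4 bits on load
    bnb_4bit_compute_dtype=torch.float16,  # fp16 matmuls (default is fp32)
    bnb_4bit_use_double_quant=True,        # also quantize the quant constants
    bnb_4bit_quant_type="nf4",             # NormalFloat4 rather than plain fp4
)

model_id = "huggyllama/llama-13b"  # placeholder model id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)

inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```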