Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-06-05 21:59:24 +02:00)
Preliminary HF GPTQ changes
@@ -57,18 +57,19 @@ class model_backend(HFTorchInferenceModel):
                     temp = json.load(f)
             else:
                 temp = {}
-            requested_parameters.append({
-                "uitype": "dropdown",
-                "unit": "text",
-                "label": "Quantization",
-                "id": "quantization",
-                "default": temp['quantization'] if 'quantization' in temp else '4bit' if dependency_exists else '16-bit',
-                "tooltip": "Whether or not to use BnB's 4-bit or 8-bit mode",
-                "menu_path": "Layers",
-                "children": [{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}, {'text': '16-bit', 'value':'16-bit'}],
-                "extra_classes": "",
-                "refresh_model_inputs": False
-            })
+            if not hasattr(self.model_config, 'quantization_config'):
+                requested_parameters.append({
+                    "uitype": "dropdown",
+                    "unit": "text",
+                    "label": "Quantization",
+                    "id": "quantization",
+                    "default": temp['quantization'] if 'quantization' in temp else '4bit' if dependency_exists else '16-bit',
+                    "tooltip": "Whether or not to use BnB's 4-bit or 8-bit mode",
+                    "menu_path": "Layers",
+                    "children": [{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}, {'text': '16-bit', 'value':'16-bit'}],
+                    "extra_classes": "",
+                    "refresh_model_inputs": False
+                })
         else:
             logger.warning("Bitsandbytes is not installed, you can not use Quantization for Huggingface models")
         return requested_parameters
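
The guard added above hides the BitsAndBytes quantization dropdown for checkpoints that already carry their own quantization settings. As a rough illustration of why that attribute check works (not taken from the commit; the model path below is a hypothetical placeholder): pre-quantized checkpoints such as GPTQ models ship a quantization_config block in config.json, and transformers surfaces it as an attribute on the loaded config.

    # Sketch, assuming transformers is installed: detect a pre-quantized checkpoint.
    from transformers import AutoConfig

    model_path = "path/to/some-gptq-model"  # hypothetical placeholder

    config = AutoConfig.from_pretrained(model_path)
    if hasattr(config, "quantization_config"):
        # Checkpoint already defines its quantization (e.g. GPTQ); skip the BnB dropdown.
        print("pre-quantized:", config.quantization_config)
    else:
        print("no quantization_config; BnB 4-bit/8-bit can be offered")
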
@@ -105,24 +106,25 @@ class model_backend(HFTorchInferenceModel):
             "low_cpu_mem_usage": True,
         }
 
-        if self.quantization == "8bit":
-            tf_kwargs.update({
-                "quantization_config":BitsAndBytesConfig(
-                    load_in_8bit=True,
-                    llm_int8_enable_fp32_cpu_offload=True
-                ),
-            })
+        if not hasattr(self.model_config, 'quantization_config'):
+            if self.quantization == "8bit":
+                tf_kwargs.update({
+                    "quantization_config":BitsAndBytesConfig(
+                        load_in_8bit=True,
+                        llm_int8_enable_fp32_cpu_offload=True
+                    ),
+                })
 
-        if self.quantization == "4bit" or utils.koboldai_vars.colab_arg:
-            tf_kwargs.update({
-                "quantization_config":BitsAndBytesConfig(
-                    load_in_4bit=True,
-                    bnb_4bit_compute_dtype=torch.float16,
-                    bnb_4bit_use_double_quant=True,
-                    bnb_4bit_quant_type='nf4',
-                    llm_int8_enable_fp32_cpu_offload=True
-                ),
-            })
+            if self.quantization == "4bit" or utils.koboldai_vars.colab_arg:
+                tf_kwargs.update({
+                    "quantization_config":BitsAndBytesConfig(
+                        load_in_4bit=True,
+                        bnb_4bit_compute_dtype=torch.float16,
+                        bnb_4bit_use_double_quant=True,
+                        bnb_4bit_quant_type='nf4',
+                        llm_int8_enable_fp32_cpu_offload=True
+                    ),
+                })
 
         if self.model_type == "gpt2":
             # We must disable low_cpu_mem_usage and if using a GPT-2 model
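
The tf_kwargs assembled above are eventually forwarded to transformers when the model is loaded. A minimal standalone sketch of the same 4-bit NF4 setup; the model id and device_map below are illustrative assumptions, not part of the commit.

    # Sketch: load a causal LM with the same BnB NF4 options the backend builds.
    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    tf_kwargs = {
        "low_cpu_mem_usage": True,
        "quantization_config": BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            llm_int8_enable_fp32_cpu_offload=True,
        ),
    }

    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-350m",  # placeholder model id for illustration
        device_map="auto",    # assumption: let accelerate decide layer placement
        **tf_kwargs,
    )
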
@@ -21,7 +21,7 @@ from pathlib import Path
 
 
 model_backend_type = "GPTQ"
-model_backend_name = "Huggingface GPTQ"
+model_backend_name = "Legacy GPTQ"
 
 
 def load_model_gptq_settings(path):