Fix 2, 3 and 8-bit loading

0cc4m
2023-05-19 18:22:26 +02:00
parent 2c18d9f2b5
commit d5eac13d9f


@@ -367,17 +367,17 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
         if legacy_groupsize is not False:
             groupsize = legacy_groupsize
-        logger.info(f"Using 4-bit file: {path_4bit}, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}")
+        logger.info(f"Using GPTQ file: {path_4bit}, {utils.koboldai_vars.gptq_bits}-bit model, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}")
         if utils.koboldai_vars.model_type == "gptj":
-            model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
+            model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list)
         elif utils.koboldai_vars.model_type == "gpt_neox":
-            model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
+            model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list)
         elif utils.koboldai_vars.model_type == "llama":
-            model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
+            model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list)
         elif utils.koboldai_vars.model_type == "opt":
-            model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
+            model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list)
         elif utils.koboldai_vars.model_type == "mpt":
-            model = load_quant_offload(mpt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
+            model = load_quant_offload(mpt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list)
         else:
             raise RuntimeError(f"4-bit load failed. Model type {utils.koboldai_vars.model_type} not supported in 4-bit")