From 20a5587d660f651f108762ec99faf357a678285d Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Sun, 30 Apr 2023 18:17:43 +0200
Subject: [PATCH] Always use offloader script, because it speeds up multi gpu

---
 modeling/inference_models/hf_torch_4bit.py | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py
index be504d4f..98c9d785 100644
--- a/modeling/inference_models/hf_torch_4bit.py
+++ b/modeling/inference_models/hf_torch_4bit.py
@@ -333,25 +333,13 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
         print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit")
 
         if utils.koboldai_vars.model_type == "gptj":
-            if self.offload_4bit:
-                model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
-            else:
-                model = gptj_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+            model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
         elif utils.koboldai_vars.model_type == "gpt_neox":
-            if self.offload_4bit:
-                model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
-            else:
-                model = gptneox_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+            model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
         elif utils.koboldai_vars.model_type == "llama":
-            if self.offload_4bit:
-                model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
-            else:
-                model = llama_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+            model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
         elif utils.koboldai_vars.model_type == "opt":
-            if self.offload_4bit:
-                model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
-            else:
-                model = opt_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+            model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
         else:
             raise RuntimeError(f"4-bit load failed. Model type {utils.koboldai_vars.model_type} not supported in 4-bit")
 
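
For reference, a minimal hypothetical sketch of the call pattern this change standardizes on, assuming only the signature visible in the diff; the names below are illustrative stand-ins, not the repository's actual offloader implementation.

# Illustrative sketch only (stand-in names, not the repo's real API): the
# pattern this patch converges on is a single offload-aware entry point that
# wraps any architecture-specific quantized loader, so one code path serves
# both single-GPU and multi-GPU layer placement.
from typing import Any, Callable, List


def load_quant_offload_sketch(
    load_quant_fn: Callable[[str, str, int, int], Any],  # e.g. llama_load_quant
    model_path: str,             # utils.koboldai_vars.custmodpth in the patch
    checkpoint_path: str,        # path_4bit in the patch
    wbits: int,                  # 4 in the patch
    groupsize: int,              # quantization group size
    gpu_layers_list: List[int],  # layers assigned to each GPU
) -> Any:
    # Load the quantized model with the architecture-specific loader.
    model = load_quant_fn(model_path, checkpoint_path, wbits, groupsize)
    # The real offloader would now distribute the model's layers across the
    # available GPUs according to gpu_layers_list; that logic is omitted here.
    return model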