Add CPU offloading support for GPT-NeoX, GPT-J and OPT

commit c8d00b7a10
Author: 0cc4m
Date:   2023-04-02 18:36:31 +02:00
Parent: e742083703


@@ -3144,6 +3144,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     if offload_4bit:
         koboldai_vars.lazy_load = False
+        print("4-bit CPU offloader active")

     # If we're using torch_lazy_loader, we need to get breakmodel config
     # early so that it knows where to load the individual model tensors
@@ -3176,9 +3177,15 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         print(f"Trying to load {koboldai_vars.model_type} model in 4-bit")
         if koboldai_vars.model_type == "gptj":
-            model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+            if offload_4bit:
+                model = load_quant_offload(gptj_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list)
+            else:
+                model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
             tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth)
         elif koboldai_vars.model_type == "gpt_neox":
-            model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+            if offload_4bit:
+                model = load_quant_offload(gptneox_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list)
+            else:
+                model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
             tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth)
         elif koboldai_vars.model_type == "llama":
@@ -3188,6 +3195,9 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
             tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth)
         elif koboldai_vars.model_type == "opt":
-            model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+            if offload_4bit:
+                model = load_quant_offload(opt_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list)
+            else:
+                model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
             tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth)
         else:
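
The `load_quant_offload` helper itself is not part of this diff. As a rough, hypothetical sketch of what such a wrapper might do, assuming it loads the 4-bit model on the CPU and then moves a per-GPU number of transformer blocks onto each device in `gpu_layers_list` (the attribute paths and the split logic below are illustrative, not the actual implementation):

```python
import torch


def load_quant_offload(load_quant_func, model_path, checkpoint_path,
                       wbits, groupsize, gpu_layers_list):
    """Hypothetical sketch only: the real helper ships with the GPTQ support
    code. Idea: load the quantized model on the CPU, then move the first N
    transformer blocks onto each GPU listed in gpu_layers_list."""
    # Load the quantized model entirely on the CPU first.
    model = load_quant_func(model_path, checkpoint_path, wbits, groupsize)

    # Locate the list of transformer blocks; the attribute path differs per
    # architecture (GPT-J, GPT-NeoX, OPT, LLaMA).
    layers = None
    for attr_path in ("transformer.h", "gpt_neox.layers",
                      "model.decoder.layers", "model.layers"):
        obj = model
        for name in attr_path.split("."):
            obj = getattr(obj, name, None)
            if obj is None:
                break
        if obj is not None:
            layers = obj
            break
    if layers is None:
        return model  # unknown architecture: keep everything on the CPU

    # gpu_layers_list[i] is taken to be the number of blocks placed on GPU i;
    # whatever is left over stays on the CPU.
    layer_idx = 0
    for device_idx, count in enumerate(gpu_layers_list):
        for _ in range(count):
            if layer_idx >= len(layers):
                break
            layers[layer_idx].to(torch.device("cuda", device_idx))
            layer_idx += 1
    return model
```

In the diff above, `gpu_layers_list` presumably mirrors the breakmodel per-GPU layer counts chosen by the user, so only the layers not assigned to a GPU run on the CPU.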