aiserver.py
@@ -96,6 +96,7 @@ from gptj import load_quant as gptj_load_quant
 from gptneox import load_quant as gptneox_load_quant
 from llama import load_quant as llama_load_quant
 from opt import load_quant as opt_load_quant
+from offload import load_quant_offload
 monkey_patched_4bit = False
 
 
@@ -3138,6 +3139,13 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     lowmem = {}
     koboldai_vars.lazy_load = False # Also, lazy loader doesn't support GPT-2 models
 
+    gpu_layers_list = [int(l) for l in gpu_layers.split(",")]
+    offload_4bit = use_4_bit and sum(gpu_layers_list) < utils.num_layers(model_config)
+
+    if offload_4bit:
+        koboldai_vars.lazy_load = False
+        print("4-bit CPU offloader active")
+
     # If we're using torch_lazy_loader, we need to get breakmodel config
     # early so that it knows where to load the individual model tensors
     if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel:
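The new offload_4bit flag is decided purely from the requested GPU-layer split: 4-bit CPU offloading kicks in whenever the user assigns fewer layers to GPUs than the model actually has, and lazy loading is then disabled, presumably so the offloader controls weight placement itself. A minimal sketch of that check with illustrative numbers (the values below are made up; in aiserver.py they come from the UI and from utils.num_layers(model_config)):

# Illustrative values only, not taken from aiserver.py.
gpu_layers = "20,0"      # hypothetical split: 20 layers on GPU 0, none on GPU 1
total_layers = 32        # hypothetical result of utils.num_layers(model_config)
use_4_bit = True

gpu_layers_list = [int(l) for l in gpu_layers.split(",")]         # [20, 0]
offload_4bit = use_4_bit and sum(gpu_layers_list) < total_layers  # True: 20 < 32
print(offload_4bit)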
@@ -3169,16 +3177,28 @@
 
     print(f"Trying to load {koboldai_vars.model_type} model in 4-bit")
     if koboldai_vars.model_type == "gptj":
-        model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+        if offload_4bit:
+            model = load_quant_offload(gptj_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list)
+        else:
+            model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
         tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth)
     elif koboldai_vars.model_type == "gpt_neox":
-        model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+        if offload_4bit:
+            model = load_quant_offload(gptneox_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list)
+        else:
+            model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
         tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth)
     elif koboldai_vars.model_type == "llama":
-        model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+        if offload_4bit:
+            model = load_quant_offload(llama_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list)
+        else:
+            model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
         tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth)
     elif koboldai_vars.model_type == "opt":
-        model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+        if offload_4bit:
+            model = load_quant_offload(opt_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list)
+        else:
+            model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize)
         tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth)
     else:
         raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit")
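Every supported architecture follows the same branching above: call load_quant_offload with the architecture-specific loader and the GPU-layer split when offloading, otherwise call the loader directly. A hedged sketch of that shared pattern, using a hypothetical dispatch table (load_quant_funcs and load_4bit_model are not names from the source; the loaders and load_quant_offload are the imports added earlier in this diff):

# Hypothetical condensed form of the per-architecture branches above.
load_quant_funcs = {
    "gptj": gptj_load_quant,
    "gpt_neox": gptneox_load_quant,
    "llama": llama_load_quant,
    "opt": opt_load_quant,
}

def load_4bit_model(model_type, custmodpth, path_4bit, groupsize, offload_4bit, gpu_layers_list):
    load_quant = load_quant_funcs.get(model_type)
    if load_quant is None:
        raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")
    if offload_4bit:
        # The wrapper takes the loader as its first argument and the per-GPU layer split last.
        return load_quant_offload(load_quant, custmodpth, path_4bit, 4, groupsize, gpu_layers_list)
    return load_quant(custmodpth, path_4bit, 4, groupsize)

The third positional argument of each loader is the weight bit width (always 4 here), matching the calls in the diff.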
@@ -3286,7 +3306,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     patch_causallm(model)
 
     if(koboldai_vars.hascuda):
-        if(koboldai_vars.usegpu):
+        if offload_4bit:
+            koboldai_vars.modeldim = get_hidden_size_from_model(model)
+            generator = model.generate
+        elif(koboldai_vars.usegpu):
             koboldai_vars.modeldim = get_hidden_size_from_model(model)
             if not use_4_bit:
                 model = model.half().to(koboldai_vars.gpu_device)
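The effect of the new branch is that an offloaded 4-bit model keeps the mixed CPU/GPU placement chosen by load_quant_offload instead of being moved wholesale to one GPU with model.half().to(...), and generation goes through model.generate as usual. A short usage sketch under that assumption (the prompt and generation settings are placeholders, not values from aiserver.py):

# Hypothetical use of the generator bound above (generator = model.generate).
import torch

prompt_ids = tokenizer("It was a dark and stormy night", return_tensors="pt").input_ids
# Depending on how load_quant_offload hooks the first layers, the inputs may need to be
# moved to the first GPU, e.g. prompt_ids = prompt_ids.to("cuda:0").
with torch.no_grad():
    output_ids = generator(prompt_ids, max_new_tokens=32)
print(tokenizer.decode(output_ids[0]))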
Conda environment file:
@@ -11,9 +11,6 @@ dependencies:
   - pytorch=1.11.*
   - python=3.8.*
   - cudatoolkit=11.1
-  - cudatoolkit-dev=11.1
-  - gcc=9.*
-  - gxx=9.*
   - eventlet=0.33.3
   - dnspython=2.2.1
   - markdown
Submodule repos/gptq updated: 954b32183a...971a5785a3