Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-06-05 21:59:24 +02:00)
Merge remote-tracking branch 'catboxanon/test/4bit' into yr4bit
aiserver.py (112 changed lines)
@@ -87,6 +87,38 @@ from io import BytesIO
 
 global tpu_mtj_backend
 
+from transformers.models.llama.tokenization_llama import LLaMATokenizer
+from repos.gptq.gptq import *
+from repos.gptq.modelutils import *
+from repos.gptq.quant import *
+def load_quant(model, checkpoint, wbits):
+    from transformers import LLaMAConfig, LLaMAForCausalLM
+    config = LLaMAConfig.from_pretrained(model)
+    def noop(*args, **kwargs):
+        pass
+    torch.nn.init.kaiming_uniform_ = noop
+    torch.nn.init.uniform_ = noop
+    torch.nn.init.normal_ = noop
+
+    torch.set_default_dtype(torch.half)
+    transformers.modeling_utils._init_weights = False
+    torch.set_default_dtype(torch.half)
+    model = LLaMAForCausalLM(config)
+    torch.set_default_dtype(torch.float)
+    model = model.eval()
+    layers = find_layers(model)
+    for name in ['lm_head']:
+        if name in layers:
+            del layers[name]
+    make_quant(model, layers, wbits)
+
+    print('Loading model ...')
+    model.load_state_dict(torch.load(checkpoint))
+    model.seqlen = 2048
+    print('Done.')
+
+    return model
+
 
 if lupa.LUA_VERSION[:2] != (5, 4):
     logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.")
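For orientation: load_quant() builds an empty LLaMA skeleton (weight initialisation is stubbed out with noop and transformers' _init_weights is disabled), swaps the Linear layers found by find_layers() for quantized ones via make_quant(), then loads the GPTQ state dict from the given checkpoint. A minimal usage sketch of the new helper; both paths below are hypothetical examples, not part of the patch:

    # Sketch only, assuming an HF-format LLaMA directory and a GPTQ 4-bit checkpoint.
    model_dir = "models/llama-7b-hf"        # hypothetical: directory holding the HF LLaMA config/tokenizer
    quant_ckpt = "models/llama-7b-4bit.pt"  # hypothetical: GPTQ-quantized state dict produced with repos/gptq

    model = load_quant(model_dir, quant_ckpt, 4)           # 4 = wbits, as in the patch below
    model = model.to("cuda:0")                             # device move only; the patch uses koboldai_vars.gpu_device
    tokenizer = LLaMATokenizer.from_pretrained(model_dir)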
@@ -1110,9 +1142,9 @@ def move_model_to_devices(model):
 
     if(not utils.HAS_ACCELERATE and not koboldai_vars.breakmodel):
         if(koboldai_vars.usegpu):
-            model = model.half().to(koboldai_vars.gpu_device)
+            model = model.to(koboldai_vars.gpu_device)
         else:
-            model = model.to('cpu').float()
+            model = model.to('cpu')
         generator = model.generate
         return
 
@@ -1140,7 +1172,6 @@ def move_model_to_devices(model):
         generator = model.generate
         return
 
-    model.half()
     gc.collect()
 
     if(hasattr(model, "transformer")):
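The casts dropped in the two hunks above (and in the later load_model hunks) would force every floating-point parameter to fp16 or fp32, whereas a plain .to(device) only moves the module and leaves dtypes as the loader set them, presumably so the weights prepared by load_quant() stay untouched. A standalone PyTorch sketch of that difference:

    import torch

    lin = torch.nn.Linear(4, 4)      # parameters are created as fp32 by default
    print(lin.weight.dtype)          # torch.float32

    lin.to("cpu")                    # device-only move: dtype is unchanged
    print(lin.weight.dtype)          # still torch.float32

    lin.half()                       # explicit cast: floating-point params become fp16
    print(lin.weight.dtype)          # torch.float16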
@@ -2886,7 +2917,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
 
         @functools.lru_cache(maxsize=None)
         def get_original_key(key):
-            return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len)
+            try:
+                return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len)
+            except ValueError:
+                return key
 
         for key, value in model_dict.items():
             original_key = get_original_key(key)
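A standalone sketch of the edge case the new try/except covers: max() over an empty generator raises ValueError, so a checkpoint key with no match in utils.module_names (for example the extra tensors a quantized checkpoint may carry, which is an assumption here) now falls back to the key itself. The module_names list below is a made-up stand-in:

    module_names = ["model.layers.0.self_attn.q_proj.weight"]   # hypothetical stand-in for utils.module_names

    def get_original_key(key):
        try:
            return max((name for name in module_names if name.endswith(key)), key=len)
        except ValueError:   # no candidate ends with `key`, so max() saw an empty sequence
            return key

    print(get_original_key("q_proj.weight"))   # -> model.layers.0.self_attn.q_proj.weight
    print(get_original_key("qzeros"))          # -> qzeros (no match, falls back to the key)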
@@ -2948,10 +2982,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3)
                 #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
                 model_dict[key] = model_dict[key].materialize(f, map_location="cpu")
-                if model_dict[key].dtype is torch.float32:
-                    koboldai_vars.fp32_model = True
-                if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32:
-                    model_dict[key] = model_dict[key].to(torch.float16)
+                # if model_dict[key].dtype is torch.float32:
+                #     koboldai_vars.fp32_model = True
+                # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32:
+                #     model_dict[key] = model_dict[key].to(torch.float16)
                 if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16):
                     model_dict[key] = model_dict[key].to(torch.float32)
                 if device == "shared":
@@ -2975,16 +3009,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             if utils.offload_index:
                 for name, tensor in utils.named_buffers:
                     dtype = tensor.dtype
-                    if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu):
-                        dtype = torch.float16
-                    if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel):
-                        dtype = torch.float32
-                    if name in model_dict and model_dict[name].dtype is not dtype:
-                        model_dict[name] = model_dict[name].to(dtype)
-                    if tensor.dtype is not dtype:
-                        tensor = tensor.to(dtype)
-                    if name not in utils.offload_index:
-                        accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index)
+                    # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu):
+                    #     dtype = torch.float16
+                    # if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel):
+                    #     dtype = torch.float32
+                    # if name in model_dict and model_dict[name].dtype is not dtype:
+                    #     model_dict[name] = model_dict[name].to(dtype)
+                    # if tensor.dtype is not dtype:
+                    #     tensor = tensor.to(dtype)
+                    # if name not in utils.offload_index:
+                    #     accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index)
                 accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache")
             utils.bar.close()
             utils.bar = None
@@ -3043,10 +3077,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             koboldai_vars.modeldim = get_hidden_size_from_model(model)
             # Is CUDA available? If so, use GPU, otherwise fall back to CPU
             if(koboldai_vars.hascuda and koboldai_vars.usegpu):
-                model = model.half().to(koboldai_vars.gpu_device)
+                model = model.to(koboldai_vars.gpu_device)
                 generator = model.generate
             else:
-                model = model.to('cpu').float()
+                model = model.to('cpu')
                 generator = model.generate
             patch_causallm(model)
             # Use the Generic implementation
@@ -3083,22 +3117,31 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
             lowmem = {}
         if(os.path.isdir(koboldai_vars.custmodpth)):
+            tokenizer = LLaMATokenizer.from_pretrained(koboldai_vars.custmodpth)
+            # try:
+            #     tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False)
+            # except Exception as e:
+            #     try:
+            #         tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
+            #     except Exception as e:
+            #         try:
+            #             tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
+            #         except Exception as e:
+            #             tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
             try:
-                tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False)
-            except Exception as e:
-                try:
-                    tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
-                except Exception as e:
-                    try:
-                        tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
-                    except Exception as e:
-                        tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
-            try:
-                model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                if os.environ.get('LLAMA_4BIT') is not None:
+                    model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4)
+                else:
+                    raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.")
+
+                if model is None:
+                    raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load.")
+
             except Exception as e:
                 if("out of memory" in traceback.format_exc().lower()):
                     raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
-                model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
         elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))):
             try:
                 tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False)
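Per the new branch above, LLAMA_4BIT must be set before aiserver.py starts, otherwise the RuntimeError about the missing variable is raised. A minimal sketch with a hypothetical path:

    import os

    # Hypothetical location; LLAMA_4BIT is passed to load_quant() as `checkpoint`
    # and ends up in torch.load(), so it should point at the GPTQ .pt file.
    os.environ["LLAMA_4BIT"] = "/path/to/llama-7b-4bit.pt"

koboldai_vars.custmodpth still has to point at the regular HF-format LLaMA folder, since load_quant() reads the model config from it and the tokenizer is loaded from the same path.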
@@ -3153,7 +3196,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             import shutil
             tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')))
             if(koboldai_vars.fp32_model and ("breakmodel" not in globals() or not breakmodel.disk_blocks)): # Use save_pretrained to convert fp32 models to fp16, unless we are using disk cache because save_pretrained is not supported in that case
-                model = model.half()
                 model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB")
             else: # For fp16 models, we can just copy the model files directly
                 import transformers.configuration_utils
@@ -3187,7 +3229,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         if(koboldai_vars.hascuda):
             if(koboldai_vars.usegpu):
                 koboldai_vars.modeldim = get_hidden_size_from_model(model)
-                model = model.half().to(koboldai_vars.gpu_device)
+                model = model.to(koboldai_vars.gpu_device)
                 generator = model.generate
             elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel)
                 koboldai_vars.modeldim = get_hidden_size_from_model(model)
@@ -3199,7 +3241,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 koboldai_vars.modeldim = get_hidden_size_from_model(model)
                 generator = model.generate
             else:
-                model = model.to('cpu').float()
+                model = model.to('cpu')
                 koboldai_vars.modeldim = get_hidden_size_from_model(model)
                 generator = model.generate
         elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0):
@@ -3207,7 +3249,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             koboldai_vars.modeldim = get_hidden_size_from_model(model)
             generator = model.generate
         else:
-            model.to('cpu').float()
+            model.to('cpu')
             koboldai_vars.modeldim = get_hidden_size_from_model(model)
             generator = model.generate
 