Improve 4-bit llama support, add 4-bit gptj and gptneox support

Author: 0cc4m
Date: 2023-03-19 21:19:02 +00:00
parent 5d17692c79
commit 60acf59316


@@ -87,37 +87,14 @@ from io import BytesIO
 global tpu_mtj_backend

-from transformers.models.llama.tokenization_llama import LLaMATokenizer
-from repos.gptq.gptq import *
-from repos.gptq.modelutils import *
-from repos.gptq.quant import *
-
-def load_quant(model, checkpoint, wbits):
-    from transformers import LLaMAConfig, LLaMAForCausalLM
-    config = LLaMAConfig.from_pretrained(model)
-    def noop(*args, **kwargs):
-        pass
-    torch.nn.init.kaiming_uniform_ = noop
-    torch.nn.init.uniform_ = noop
-    torch.nn.init.normal_ = noop
-
-    torch.set_default_dtype(torch.half)
-    transformers.modeling_utils._init_weights = False
-    torch.set_default_dtype(torch.half)
-    model = LLaMAForCausalLM(config)
-    torch.set_default_dtype(torch.float)
-    model = model.eval()
-    layers = find_layers(model)
-    for name in ['lm_head']:
-        if name in layers:
-            del layers[name]
-    make_quant(model, layers, wbits)
-
-    print('Loading model ...')
-    model.load_state_dict(torch.load(checkpoint))
-    model.seqlen = 2048
-    print('Done.')
-
-    return model
+# 4-bit dependencies
+from pathlib import Path
+sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
+from gptj import load_quant as gptj_load_quant
+from gptneox import load_quant as gptneox_load_quant
+from llama import load_quant as llama_load_quant
+vars_4bit = {}

 if lupa.LUA_VERSION[:2] != (5, 4):
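The inline, LLaMA-only `load_quant` is removed and the per-model loaders are now imported from `repos/gptq` (gptj.py, gptneox.py, llama.py) instead. For reference, the sketch below annotates the loading pattern the removed function followed, which those per-model loaders presumably share; `find_layers` and `make_quant` are the GPTQ helpers the old star-imports pulled in, and the `LLaMA*` class names are the pre-release transformers naming the old code targeted.

```python
import torch
import transformers
from transformers import LLaMAConfig, LLaMAForCausalLM  # pre-release LLaMA class names, as in the old code
from repos.gptq.modelutils import find_layers            # GPTQ helper: collect the nn.Linear modules
from repos.gptq.quant import make_quant                  # GPTQ helper: swap them for quantized layers

def load_quant_sketch(model, checkpoint, wbits):
    config = LLaMAConfig.from_pretrained(model)

    # Weight init is pointless here because every tensor is overwritten by the
    # checkpoint, so stub it out to speed up model construction.
    def noop(*args, **kwargs):
        pass
    torch.nn.init.kaiming_uniform_ = noop
    torch.nn.init.uniform_ = noop
    torch.nn.init.normal_ = noop
    transformers.modeling_utils._init_weights = False

    torch.set_default_dtype(torch.half)
    model = LLaMAForCausalLM(config)      # empty fp16 skeleton
    torch.set_default_dtype(torch.float)
    model = model.eval()

    layers = find_layers(model)
    layers.pop('lm_head', None)           # the output head stays unquantized
    make_quant(model, layers, wbits)      # insert 4-bit quantized linear layers

    model.load_state_dict(torch.load(checkpoint))  # the .pt file passed on the command line
    model.seqlen = 2048
    return model
```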
@@ -1541,6 +1518,11 @@ def general_startup(override_args=None):
     parser.add_argument('-v', '--verbosity', action='count', default=0, help="The default logging level is ERROR or higher. This value increases the amount of logging seen in your screen")
     parser.add_argument('-q', '--quiesce', action='count', default=0, help="The default logging level is ERROR or higher. This value decreases the amount of logging seen in your screen")

+    # 4-bit stuff
+    parser.add_argument('--gptj4bit', help="Load a GPT-J model 4-bit pt file with this path")
+    parser.add_argument('--gptneox4bit', help="Load a GPT-NeoX model 4-bit pt file with this path")
+    parser.add_argument('--llama4bit', help="Load a Llama model 4-bit pt file with this path")
+
     #args: argparse.Namespace = None
     if "pytest" in sys.modules and override_args is None:
         args = parser.parse_args([])
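The three new switches only take a path; nothing else about the CLI changes. A standalone illustration of how they parse (the sample path is made up): a flag that is not given comes back as None, which is what the later `vars_4bit.get(...)` checks rely on.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--gptj4bit', help="Load a GPT-J model 4-bit pt file with this path")
parser.add_argument('--gptneox4bit', help="Load a GPT-NeoX model 4-bit pt file with this path")
parser.add_argument('--llama4bit', help="Load a Llama model 4-bit pt file with this path")

# Only one of the flags is passed; the others default to None.
args = parser.parse_args(["--llama4bit", "models/llama-7b-4bit.pt"])
print(args.llama4bit)   # models/llama-7b-4bit.pt
print(args.gptj4bit)    # None
```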
@@ -1644,6 +1626,11 @@ def general_startup(override_args=None):
     koboldai_vars.smanrename = koboldai_vars.host == args.override_rename
     koboldai_vars.aria2_port = args.aria2_port or 6799

+    global vars_4bit
+    vars_4bit["gptj4bit"] = args.gptj4bit
+    vars_4bit["gptneox4bit"] = args.gptneox4bit
+    vars_4bit["llama4bit"] = args.llama4bit
+
     #Now let's look to see if we are going to force a load of a model from a user selected folder
     if(koboldai_vars.model == "selectfolder"):
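Together with the previous hunk, this wires the command-line paths into the module-level `vars_4bit` dict that `load_model()` reads later. A tiny sketch of that handoff with hypothetical values: entries left at None make every `.get()` check falsy, so loading falls through to the regular transformers path.

```python
# Names mirror the diff; the path value is hypothetical.
vars_4bit = {"gptj4bit": None, "gptneox4bit": None, "llama4bit": "models/llama-7b-4bit.pt"}

if vars_4bit.get("gptj4bit"):
    chosen = "gptj_load_quant"
elif vars_4bit.get("gptneox4bit"):
    chosen = "gptneox_load_quant"
elif vars_4bit.get("llama4bit"):
    chosen = "llama_load_quant"
else:
    chosen = "AutoModelForCausalLM"

print(chosen)  # llama_load_quant
```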
@@ -2971,7 +2958,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 try:
                     f = z.open(f"archive/data/{storage_key}")
                 except:
-                    f = z.open(f"{zipfolder}/data/{storage_key}")
+                    ziproot = z.namelist()[0].split(os.sep)[0]
+                    f = z.open(f"{ziproot}/data/{storage_key}")
                 current_offset = 0
                 if current_offset != model_dict[key].seek_offset:
                     f.read(model_dict[key].seek_offset - current_offset)
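This also fixes the lazy-loader fallback for checkpoints whose zip root folder is not named `archive`: instead of relying on the precomputed `zipfolder` name, the root folder is now read from the archive's own listing. A standalone illustration, assuming the usual single top-level folder inside a `.pt` zip and a made-up folder name:

```python
import io
import zipfile

# Build a tiny stand-in for a .pt checkpoint whose root folder is not "archive".
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as z:
    z.writestr("llama-7b-4bit/data.pkl", b"metadata")
    z.writestr("llama-7b-4bit/data/0", b"\x00" * 16)

with zipfile.ZipFile(buf) as z:
    storage_key = "0"
    try:
        f = z.open(f"archive/data/{storage_key}")  # root name the loader tries first
    except KeyError:
        # Read the real root from the archive listing instead of guessing it.
        # Zip entry names always use "/", so splitting on os.sep as the commit does
        # is equivalent on POSIX hosts.
        ziproot = z.namelist()[0].split("/")[0]
        f = z.open(f"{ziproot}/data/{storage_key}")
    print(ziproot)  # llama-7b-4bit
```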
@@ -3117,23 +3105,29 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
             lowmem = {}
         if(os.path.isdir(koboldai_vars.custmodpth)):
-            tokenizer = LLaMATokenizer.from_pretrained(koboldai_vars.custmodpth)
-            # try:
-            #     tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False)
-            # except Exception as e:
-            #     try:
-            #         tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
-            #     except Exception as e:
-            #         try:
-            #             tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
-            #         except Exception as e:
-            #             tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
-            # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
-
-            if os.environ.get('LLAMA_4BIT'):
-                model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4)
+            global vars_4bit
+
+            if vars_4bit.get("gptj4bit"):
+                model = gptj_load_quant(koboldai_vars.custmodpth, vars_4bit["gptj4bit"], 4)
+                tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth)
+            elif vars_4bit.get("gptneox4bit"):
+                model = gptneox_load_quant(koboldai_vars.custmodpth, vars_4bit["gptneox4bit"], 4)
+                tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth)
+            elif vars_4bit.get("llama4bit"):
+                model = llama_load_quant(koboldai_vars.custmodpth, vars_4bit["llama4bit"], 4)
+                tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth)
             else:
-                raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.")
+                try:
+                    tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False)
+                except Exception as e:
+                    try:
+                        tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
+                    except Exception as e:
+                        try:
+                            tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
+                        except Exception as e:
+                            tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
+                model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)

             if model is None:
                 raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load.")
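The else branch restores the tokenizer fallback chain that the parent commit had commented out, while the 4-bit branches load their tokenizer with plain `AutoTokenizer`. Purely as a reading aid, here is a compact restatement of that fallback order as a helper function; the helper itself is hypothetical and not part of the commit.

```python
from transformers import AutoTokenizer, GPT2Tokenizer

def load_tokenizer_with_fallbacks(path, revision=None):
    # Same order as the else branch above: slow AutoTokenizer, fast AutoTokenizer,
    # GPT2Tokenizer from the model folder, then the stock gpt2 tokenizer.
    attempts = (
        lambda: AutoTokenizer.from_pretrained(path, revision=revision, cache_dir="cache", use_fast=False),
        lambda: AutoTokenizer.from_pretrained(path, revision=revision, cache_dir="cache"),
        lambda: GPT2Tokenizer.from_pretrained(path, revision=revision, cache_dir="cache"),
    )
    for attempt in attempts:
        try:
            return attempt()
        except Exception:
            continue
    return GPT2Tokenizer.from_pretrained("gpt2", revision=revision, cache_dir="cache")
```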