From 8c9ed5540626655870b6c8e79b5a838f6f012a91 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Thu, 9 Mar 2023 22:36:45 -0500 Subject: [PATCH 001/113] Update aiserver.py --- aiserver.py | 63 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/aiserver.py b/aiserver.py index 7c60b04e..4174d1fa 100644 --- a/aiserver.py +++ b/aiserver.py @@ -87,6 +87,38 @@ from io import BytesIO global tpu_mtj_backend +from transformers.models.llama.tokenization_llama import LLaMATokenizer +from repos.gptq.gptq import * +from repos.gptq.modelutils import * +from repos.gptq.quant import * +def load_quant(model, checkpoint, wbits): + from transformers import LLaMAConfig, LLaMAForCausalLM + config = LLaMAConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = LLaMAForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + make_quant(model, layers, wbits) + + print('Loading model ...') + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done.') + + return model + if lupa.LUA_VERSION[:2] != (5, 4): logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.") @@ -2886,7 +2918,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal @functools.lru_cache(maxsize=None) def get_original_key(key): - return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + try: + return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + except ValueError: + return key for key, value in model_dict.items(): original_key = get_original_key(key) @@ -3083,22 +3118,24 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): + tokenizer = LLaMATokenizer.from_pretrained(koboldai_vars.custmodpth) + # try: + # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) + # except Exception as e: + # try: + # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") + # except Exception as e: + # try: + # tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") + # except Exception as e: + # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) - except Exception as e: - try: - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - try: - tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - tokenizer = GPT2Tokenizer.from_pretrained("gpt2", 
revision=koboldai_vars.revision, cache_dir="cache") - try: - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_30B_4BIT'], 4) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From dcf9d37a00dc582618f10deef6d226f77018dc16 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Fri, 10 Mar 2023 00:01:40 -0500 Subject: [PATCH 002/113] It just works. --- aiserver.py | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/aiserver.py b/aiserver.py index 4174d1fa..66aa7362 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1142,9 +1142,9 @@ def move_model_to_devices(model): if(not utils.HAS_ACCELERATE and not koboldai_vars.breakmodel): if(koboldai_vars.usegpu): - model = model.half().to(koboldai_vars.gpu_device) + model = model.to(koboldai_vars.gpu_device) else: - model = model.to('cpu').float() + model = model.to('cpu') generator = model.generate return @@ -1172,7 +1172,6 @@ def move_model_to_devices(model): generator = model.generate return - model.half() gc.collect() if(hasattr(model, "transformer")): @@ -2983,10 +2982,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
", end="", flush=True) model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - if model_dict[key].dtype is torch.float32: - koboldai_vars.fp32_model = True - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - model_dict[key] = model_dict[key].to(torch.float16) + # if model_dict[key].dtype is torch.float32: + # koboldai_vars.fp32_model = True + # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: + # model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -3010,16 +3009,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if utils.offload_index: for name, tensor in utils.named_buffers: dtype = tensor.dtype - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): - dtype = torch.float32 - if name in model_dict and model_dict[name].dtype is not dtype: - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is not dtype: - tensor = tensor.to(dtype) - if name not in utils.offload_index: - accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) + # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): + # dtype = torch.float16 + # if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): + # dtype = torch.float32 + # if name in model_dict and model_dict[name].dtype is not dtype: + # model_dict[name] = model_dict[name].to(dtype) + # if tensor.dtype is not dtype: + # tensor = tensor.to(dtype) + # if name not in utils.offload_index: + # accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache") utils.bar.close() utils.bar = None @@ -3078,10 +3077,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) # Is CUDA available? 
If so, use GPU, otherwise fall back to CPU if(koboldai_vars.hascuda and koboldai_vars.usegpu): - model = model.half().to(koboldai_vars.gpu_device) + model = model.to(koboldai_vars.gpu_device) generator = model.generate else: - model = model.to('cpu').float() + model = model.to('cpu') generator = model.generate patch_causallm(model) # Use the Generic implementation @@ -3131,7 +3130,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_30B_4BIT'], 4) + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") @@ -3190,7 +3189,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal import shutil tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) if(koboldai_vars.fp32_model and ("breakmodel" not in globals() or not breakmodel.disk_blocks)): # Use save_pretrained to convert fp32 models to fp16, unless we are using disk cache because save_pretrained is not supported in that case - model = model.half() model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB") else: # For fp16 models, we can just copy the model files directly import transformers.configuration_utils @@ -3224,7 +3222,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.hascuda): if(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) - model = model.half().to(koboldai_vars.gpu_device) + model = model.to(koboldai_vars.gpu_device) generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) @@ -3236,7 +3234,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model = model.to('cpu').float() + model = model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): @@ -3244,7 +3242,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model.to('cpu').float() + model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate From 3f132ce45ba61f30015147bb0d9ba26647204332 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Fri, 10 Mar 2023 03:26:09 -0500 Subject: [PATCH 003/113] Notify if LLAMA_4BIT env var not set --- aiserver.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 66aa7362..399ce434 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3130,7 +3130,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: 
# model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + if os.environ.get('LLAMA_4BIT') is not None: + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + else: + raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before procedding.") + exit(1) + except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") From 1808b0d2eca42e30bee6edd6896744cfd6995ffc Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Sat, 11 Mar 2023 12:13:22 -0500 Subject: [PATCH 004/113] Another safety check for if model is not loaded --- aiserver.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 399ce434..3ec8f284 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3133,13 +3133,17 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if os.environ.get('LLAMA_4BIT') is not None: model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) else: - raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before procedding.") + raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") exit(1) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + + if model is None: + raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load. Exiting.") + exit(1) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From bde31217f164a3aadc4282913012378a886d6058 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Sat, 11 Mar 2023 12:15:58 -0500 Subject: [PATCH 005/113] improve model None check --- aiserver.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/aiserver.py b/aiserver.py index 3ec8f284..c14ac730 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3134,16 +3134,14 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) else: raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") - exit(1) + + if model is None: + raise RuntimeError("Model returned 'None'. 
This is not expected to happen, but due to this, the model will not load.") except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - - if model is None: - raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load. Exiting.") - exit(1) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From b3b454bbe4b4a479ec5703b99487bf00906975ac Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed, 15 Mar 2023 00:03:43 -0500 Subject: [PATCH 006/113] Update huggingface.yml --- environments/huggingface.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 222bb6ad..26e7e670 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -29,7 +29,8 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers==4.25.1 + - git+https://github.com/zphang/transformers@llama_push + - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc - huggingface_hub==0.12.1 - safetensors - accelerate From 5d17692c79a3642b7e1ae1c37e262cd47f449356 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 16 Mar 2023 05:19:47 +0000 Subject: [PATCH 007/113] Remove except Exception so that errors actually show up --- aiserver.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/aiserver.py b/aiserver.py index 77e31b63..40d9a4ba 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3128,20 +3128,15 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") # except Exception as e: # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - try: - # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - if os.environ.get('LLAMA_4BIT') is not None: - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) - else: - raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") + # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - if model is None: - raise RuntimeError("Model returned 'None'. 
This is not expected to happen, but due to this, the model will not load.") + if os.environ.get('LLAMA_4BIT'): + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + else: + raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") - except Exception as e: - if("out of memory" in traceback.format_exc().lower()): - raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + if model is None: + raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load.") elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From 60acf593160ce86118286ab0fa5c4ce082ddc52c Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 19 Mar 2023 21:19:02 +0000 Subject: [PATCH 008/113] Improve 4-bit llama support, add 4-bit gptj and gptneox support --- aiserver.py | 86 +++++++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 46 deletions(-) diff --git a/aiserver.py b/aiserver.py index 40d9a4ba..96ea7490 100644 --- a/aiserver.py +++ b/aiserver.py @@ -87,37 +87,14 @@ from io import BytesIO global tpu_mtj_backend -from transformers.models.llama.tokenization_llama import LLaMATokenizer -from repos.gptq.gptq import * -from repos.gptq.modelutils import * -from repos.gptq.quant import * -def load_quant(model, checkpoint, wbits): - from transformers import LLaMAConfig, LLaMAForCausalLM - config = LLaMAConfig.from_pretrained(model) - def noop(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = noop - torch.nn.init.uniform_ = noop - torch.nn.init.normal_ = noop - torch.set_default_dtype(torch.half) - transformers.modeling_utils._init_weights = False - torch.set_default_dtype(torch.half) - model = LLaMAForCausalLM(config) - torch.set_default_dtype(torch.float) - model = model.eval() - layers = find_layers(model) - for name in ['lm_head']: - if name in layers: - del layers[name] - make_quant(model, layers, wbits) - - print('Loading model ...') - model.load_state_dict(torch.load(checkpoint)) - model.seqlen = 2048 - print('Done.') - - return model +# 4-bit dependencies +from pathlib import Path +sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) +from gptj import load_quant as gptj_load_quant +from gptneox import load_quant as gptneox_load_quant +from llama import load_quant as llama_load_quant +vars_4bit = {} if lupa.LUA_VERSION[:2] != (5, 4): @@ -1541,6 +1518,11 @@ def general_startup(override_args=None): parser.add_argument('-v', '--verbosity', action='count', default=0, help="The default logging level is ERROR or higher. This value increases the amount of logging seen in your screen") parser.add_argument('-q', '--quiesce', action='count', default=0, help="The default logging level is ERROR or higher. 
This value decreases the amount of logging seen in your screen") + # 4-bit stuff + parser.add_argument('--gptj4bit', help="Load a GPT-J model 4-bit pt file with this path") + parser.add_argument('--gptneox4bit', help="Load a GPT-NeoX model 4-bit pt file with this path") + parser.add_argument('--llama4bit', help="Load a Llama model 4-bit pt file with this path") + #args: argparse.Namespace = None if "pytest" in sys.modules and override_args is None: args = parser.parse_args([]) @@ -1644,6 +1626,11 @@ def general_startup(override_args=None): koboldai_vars.smanrename = koboldai_vars.host == args.override_rename koboldai_vars.aria2_port = args.aria2_port or 6799 + + global vars_4bit + vars_4bit["gptj4bit"] = args.gptj4bit + vars_4bit["gptneox4bit"] = args.gptneox4bit + vars_4bit["llama4bit"] = args.llama4bit #Now let's look to see if we are going to force a load of a model from a user selected folder if(koboldai_vars.model == "selectfolder"): @@ -2971,7 +2958,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal try: f = z.open(f"archive/data/{storage_key}") except: - f = z.open(f"{zipfolder}/data/{storage_key}") + ziproot = z.namelist()[0].split(os.sep)[0] + f = z.open(f"{ziproot}/data/{storage_key}") current_offset = 0 if current_offset != model_dict[key].seek_offset: f.read(model_dict[key].seek_offset - current_offset) @@ -3117,23 +3105,29 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): - tokenizer = LLaMATokenizer.from_pretrained(koboldai_vars.custmodpth) - # try: - # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) - # except Exception as e: - # try: - # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - # except Exception as e: - # try: - # tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - # except Exception as e: - # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + global vars_4bit - if os.environ.get('LLAMA_4BIT'): - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + if vars_4bit.get("gptj4bit"): + model = gptj_load_quant(koboldai_vars.custmodpth, vars_4bit["gptj4bit"], 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif vars_4bit.get("gptneox4bit"): + model = gptneox_load_quant(koboldai_vars.custmodpth, vars_4bit["gptneox4bit"], 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif vars_4bit.get("llama4bit"): + model = llama_load_quant(koboldai_vars.custmodpth, vars_4bit["llama4bit"], 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: - raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") + try: + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) + except Exception as e: + try: + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, 
revision=koboldai_vars.revision, cache_dir="cache") + except Exception as e: + try: + tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") + except Exception as e: + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) if model is None: raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load.") From 858657f6691933ad3660660001837491b7ba4ae6 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 20 Mar 2023 09:16:30 +0100 Subject: [PATCH 009/113] Fix zipfile folder identification fix for Windows --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 96ea7490..4558ce3d 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2958,7 +2958,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal try: f = z.open(f"archive/data/{storage_key}") except: - ziproot = z.namelist()[0].split(os.sep)[0] + ziproot = z.namelist()[0].split("/")[0] f = z.open(f"{ziproot}/data/{storage_key}") current_offset = 0 if current_offset != model_dict[key].seek_offset: From 4cfc1219d449ebc92205eed15f0ffc1b133db708 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 20 Mar 2023 19:13:46 +0000 Subject: [PATCH 010/113] Add gptq as submodule --- .gitmodules | 4 ++++ repos/gptq | 1 + 2 files changed, 5 insertions(+) create mode 160000 repos/gptq diff --git a/.gitmodules b/.gitmodules index 0107a8c3..c6f4b308 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,7 @@ [submodule "KoboldAI-Horde-Bridge"] path = KoboldAI-Horde-Bridge url = https://github.com/db0/KoboldAI-Horde-Bridge +[submodule "repos/gptq"] + path = repos/gptq + url = https://github.com/0cc4m/GPTQ-for-LLaMa + branch = a8303654c200c25577130466e5f9bc1e70fc8a50 diff --git a/repos/gptq b/repos/gptq new file mode 160000 index 00000000..a8303654 --- /dev/null +++ b/repos/gptq @@ -0,0 +1 @@ +Subproject commit a8303654c200c25577130466e5f9bc1e70fc8a50 From ecd065a881d40996558ff07d0e2bfdbdf255e777 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 21 Mar 2023 21:40:59 +0000 Subject: [PATCH 011/113] Overhaul 4-bit support to load with a toggle --- aiserver.py | 145 +++++++++++++++++++++++++++--------------- koboldai_settings.py | 6 +- static/koboldai.js | 32 +++++++++- templates/popups.html | 6 +- 4 files changed, 130 insertions(+), 59 deletions(-) diff --git a/aiserver.py b/aiserver.py index f58d949a..7497dfb9 100644 --- a/aiserver.py +++ b/aiserver.py @@ -70,7 +70,7 @@ from utils import debounce import utils import koboldai_settings import torch -from transformers import StoppingCriteria, GPT2Tokenizer, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel, modeling_utils, AutoModelForTokenClassification +from transformers import StoppingCriteria, GPT2Tokenizer, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel, modeling_utils, AutoModelForTokenClassification, LlamaTokenizer from transformers import __version__ as transformers_version import transformers try: @@ -1114,14 +1114,20 @@ def device_config(config): koboldai_vars.usegpu = False return -def move_model_to_devices(model): +def move_model_to_devices(model, use_4_bit=False): global generator if(not 
utils.HAS_ACCELERATE and not koboldai_vars.breakmodel): if(koboldai_vars.usegpu): - model = model.to(koboldai_vars.gpu_device) + if not use_4_bit: + model = model.half().to(koboldai_vars.gpu_device) + else: + model = model.to(koboldai_vars.gpu_device) else: - model = model.to('cpu') + if not use_4_bit: + model = model.to('cpu').float() + else: + model = model.to('cpu') generator = model.generate return @@ -1149,6 +1155,8 @@ def move_model_to_devices(model): generator = model.generate return + if not use_4_bit: + model.half() gc.collect() if(hasattr(model, "transformer")): @@ -1518,11 +1526,6 @@ def general_startup(override_args=None): parser.add_argument('-v', '--verbosity', action='count', default=0, help="The default logging level is ERROR or higher. This value increases the amount of logging seen in your screen") parser.add_argument('-q', '--quiesce', action='count', default=0, help="The default logging level is ERROR or higher. This value decreases the amount of logging seen in your screen") - # 4-bit stuff - parser.add_argument('--gptj4bit', help="Load a GPT-J model 4-bit pt file with this path") - parser.add_argument('--gptneox4bit', help="Load a GPT-NeoX model 4-bit pt file with this path") - parser.add_argument('--llama4bit', help="Load a Llama model 4-bit pt file with this path") - #args: argparse.Namespace = None if "pytest" in sys.modules and override_args is None: args = parser.parse_args([]) @@ -1626,11 +1629,6 @@ def general_startup(override_args=None): koboldai_vars.smanrename = koboldai_vars.host == args.override_rename koboldai_vars.aria2_port = args.aria2_port or 6799 - - global vars_4bit - vars_4bit["gptj4bit"] = args.gptj4bit - vars_4bit["gptneox4bit"] = args.gptneox4bit - vars_4bit["llama4bit"] = args.llama4bit #Now let's look to see if we are going to force a load of a model from a user selected folder if(koboldai_vars.model == "selectfolder"): @@ -1777,6 +1775,7 @@ def get_model_info(model, directory=""): 'break_values': break_values, 'gpu_count': gpu_count, 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select, 'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False, + 'bit_4_available': koboldai_vars.bit_4_available if koboldai_vars.experimental_features else False, 'show_custom_model_box': show_custom_model_box}) if send_horde_models: get_cluster_models({'key': key_value, 'url': default_url}) @@ -1918,6 +1917,18 @@ def get_cluster_models(msg): emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2") +@socketio.on("use_4_bit_toggle") +def use_4_bit_toggle(msg): + # Disable lazy_load and breakmodel + if msg["use_4_bit"]: + koboldai_vars.lazy_load = False + koboldai_vars.nobreakmodel = True + else: + koboldai_vars.lazy_load = True + koboldai_vars.nobreakmodel = False + + # TODO: Reload JS values for this stuff + # Function to patch transformers to use our soft prompt def patch_causallm(model): from torch.nn import Embedding @@ -2647,7 +2658,7 @@ def unload_model(): koboldai_vars.badwordsids = koboldai_settings.badwordsids_default -def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False): +def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, 
use_4_bit=False): global model global generator global torch @@ -2684,7 +2695,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal disk_layers = args.breakmodel_disklayers if breakmodel_args_default_to_cpu and disk_layers is None: disk_layers = args.breakmodel_disklayers = 0 - + unload_model() if online_model == "": @@ -2904,10 +2915,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal @functools.lru_cache(maxsize=None) def get_original_key(key): - try: - return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) - except ValueError: - return key + # try: + return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + # except ValueError: + # return key for key, value in model_dict.items(): original_key = get_original_key(key) @@ -2970,10 +2981,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True) model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - # if model_dict[key].dtype is torch.float32: - # koboldai_vars.fp32_model = True - # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - # model_dict[key] = model_dict[key].to(torch.float16) + if not use_4_bit: + if model_dict[key].dtype is torch.float32: + koboldai_vars.fp32_model = True + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: + model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -2997,16 +3009,17 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if utils.offload_index: for name, tensor in utils.named_buffers: dtype = tensor.dtype - # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): - # dtype = torch.float16 - # if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): - # dtype = torch.float32 - # if name in model_dict and model_dict[name].dtype is not dtype: - # model_dict[name] = model_dict[name].to(dtype) - # if tensor.dtype is not dtype: - # tensor = tensor.to(dtype) - # if name not in utils.offload_index: - # accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) + if not use_4_bit: + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): + dtype = torch.float16 + if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): + dtype = torch.float32 + if name in model_dict and model_dict[name].dtype is not dtype: + model_dict[name] = model_dict[name].to(dtype) + if tensor.dtype is not dtype: + tensor = tensor.to(dtype) + if name not 
in utils.offload_index: + accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache") utils.bar.close() utils.bar = None @@ -3065,10 +3078,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) # Is CUDA available? If so, use GPU, otherwise fall back to CPU if(koboldai_vars.hascuda and koboldai_vars.usegpu): - model = model.to(koboldai_vars.gpu_device) + if not use_4_bit: + model = model.half().to(koboldai_vars.gpu_device) + else: + model = model.to(koboldai_vars.gpu_device) generator = model.generate else: - model = model.to('cpu') + if not use_4_bit: + model = model.to('cpu').float() + else: + model = model.to('cpu') generator = model.generate patch_causallm(model) # Use the Generic implementation @@ -3105,17 +3124,26 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): - global vars_4bit - if vars_4bit.get("gptj4bit"): - model = gptj_load_quant(koboldai_vars.custmodpth, vars_4bit["gptj4bit"], 4) - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) - elif vars_4bit.get("gptneox4bit"): - model = gptneox_load_quant(koboldai_vars.custmodpth, vars_4bit["gptneox4bit"], 4) - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) - elif vars_4bit.get("llama4bit"): - model = llama_load_quant(koboldai_vars.custmodpth, vars_4bit["llama4bit"], 4) - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") + + if not os.path.isfile(path_4bit): + print(f"4-bit file {path_4bit} not found, aborting 4-bit load") + use_4_bit = False + + if use_4_bit: + print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") + if koboldai_vars.model_type == "gptj": + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif koboldai_vars.model_type == "gpt_neox": + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif koboldai_vars.model_type == "llama": + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) + else: + raise RuntimeError(f"4-bit load failed. 
Model type {koboldai_vars.model_type} not supported in 4-bit") else: try: tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) @@ -3185,6 +3213,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal import shutil tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) if(koboldai_vars.fp32_model and ("breakmodel" not in globals() or not breakmodel.disk_blocks)): # Use save_pretrained to convert fp32 models to fp16, unless we are using disk cache because save_pretrained is not supported in that case + if not use_4_bit: + model = model.half() model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB") else: # For fp16 models, we can just copy the model files directly import transformers.configuration_utils @@ -3218,27 +3248,36 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.hascuda): if(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) - model = model.to(koboldai_vars.gpu_device) + if not use_4_bit: + model = model.half().to(koboldai_vars.gpu_device) + else: + model = model.to(koboldai_vars.gpu_device) generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) if(not koboldai_vars.lazy_load): device_config(model.config) - move_model_to_devices(model) + move_model_to_devices(model, use_4_bit) elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): - move_model_to_devices(model) + move_model_to_devices(model, use_4_bit) koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model = model.to('cpu') + if not use_4_bit: + model.to('cpu').float() + else: + model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): - move_model_to_devices(model) + move_model_to_devices(model, use_4_bit) koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model.to('cpu') + if not use_4_bit: + model.to('cpu').float() + else: + model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate @@ -8784,7 +8823,7 @@ def UI_2_load_model(data): koboldai_vars.model = data['model'] koboldai_vars.custmodpth = data['path'] print("loading Model") - load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit']) + load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'], use_4_bit=data['use_4_bit']) #==================================================================# # Event triggered when load story is clicked diff --git a/koboldai_settings.py b/koboldai_settings.py index 95caec0c..16cc8128 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1207,7 +1207,7 @@ class system_settings(settings): 'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy', 'serverstarted', 'inference_config', 'image_pipeline', 'summarizer', 'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model', - 
'generating_image', 'bit_8_available', 'host', 'hascuda', 'usegpu', 'rng_states'] + 'generating_image', 'bit_8_available', 'bit_4_available', 'host', 'hascuda', 'usegpu', 'rng_states'] settings_name = "system" def __init__(self, socketio, koboldai_var): self._socketio = socketio @@ -1302,6 +1302,8 @@ class system_settings(settings): elif torch.cuda.get_device_properties(device).major == 7 and torch.cuda.get_device_properties(device).minor >= 2: self.bit_8_available = True break + # Check if repos/gptq exists for 4-bit mode + self.bit_4_available = os.path.isdir("repos/gptq") self.seen_messages = [] @@ -2744,4 +2746,4 @@ default_preset = { ] } badwordsids_default = [[6880], [50256], [42496], [4613], [17414], [22039], [16410], [27], [29], [38430], [37922], [15913], [24618], [28725], [58], [47175], [36937], [26700], [12878], [16471], [37981], [5218], [29795], [13412], [45160], [3693], [49778], [4211], [20598], [36475], [33409], [44167], [32406], [29847], [29342], [42669], [685], [25787], [7359], [3784], [5320], [33994], [33490], [34516], [43734], [17635], [24293], [9959], [23785], [21737], [28401], [18161], [26358], [32509], [1279], [38155], [18189], [26894], [6927], [14610], [23834], [11037], [14631], [26933], [46904], [22330], [25915], [47934], [38214], [1875], [14692], [41832], [13163], [25970], [29565], [44926], [19841], [37250], [49029], [9609], [44438], [16791], [17816], [30109], [41888], [47527], [42924], [23984], [49074], [33717], [31161], [49082], [30138], [31175], [12240], [14804], [7131], [26076], [33250], [3556], [38381], [36338], [32756], [46581], [17912], [49146]] # Tokenized array of badwords used to prevent AI artifacting -badwordsids_neox = [[0], [1], [44162], [9502], [12520], [31841], [36320], [49824], [34417], [6038], [34494], [24815], [26635], [24345], [3455], [28905], [44270], [17278], [32666], [46880], [7086], [43189], [37322], [17778], [20879], [49821], [3138], [14490], [4681], [21391], [26786], [43134], [9336], [683], [48074], [41256], [19181], [29650], [28532], [36487], [45114], [46275], [16445], [15104], [11337], [1168], [5647], [29], [27482], [44965], [43782], [31011], [42944], [47389], [6334], [17548], [38329], [32044], [35487], [2239], [34761], [7444], [1084], [12399], [18990], [17636], [39083], [1184], [35830], [28365], [16731], [43467], [47744], [1138], [16079], [40116], [45564], [18297], [42368], [5456], [18022], [42696], [34476], [23505], [23741], [39334], [37944], [45382], [38709], [33440], [26077], [43600], [34418], [36033], [6660], [48167], [48471], [15775], [19884], [41533], [1008], [31053], [36692], [46576], [20095], [20629], [31759], [46410], [41000], [13488], [30952], [39258], [16160], [27655], [22367], [42767], [43736], [49694], [13811], [12004], [46768], [6257], [37471], [5264], [44153], [33805], [20977], [21083], [25416], [14277], [31096], [42041], [18331], [33376], [22372], [46294], [28379], [38475], [1656], [5204], [27075], [50001], [16616], [11396], [7748], [48744], [35402], [28120], [41512], [4207], [43144], [14767], [15640], [16595], [41305], [44479], [38958], [18474], [22734], [30522], [46267], [60], [13976], [31830], [48701], [39822], [9014], [21966], [31422], [28052], [34607], [2479], [3851], [32214], [44082], [45507], [3001], [34368], [34758], [13380], [38363], [4299], [46802], [30996], [12630], [49236], [7082], [8795], [5218], [44740], [9686], [9983], [45301], [27114], [40125], [1570], [26997], [544], [5290], [49193], [23781], [14193], [40000], [2947], [43781], [9102], [48064], [42274], [18772], [49384], [9884], [45635], [43521], [31258], 
[32056], [47686], [21760], [13143], [10148], [26119], [44308], [31379], [36399], [23983], [46694], [36134], [8562], [12977], [35117], [28591], [49021], [47093], [28653], [29013], [46468], [8605], [7254], [25896], [5032], [8168], [36893], [38270], [20499], [27501], [34419], [29547], [28571], [36586], [20871], [30537], [26842], [21375], [31148], [27618], [33094], [3291], [31789], [28391], [870], [9793], [41361], [47916], [27468], [43856], [8850], [35237], [15707], [47552], [2730], [41449], [45488], [3073], [49806], [21938], [24430], [22747], [20924], [46145], [20481], [20197], [8239], [28231], [17987], [42804], [47269], [29972], [49884], [21382], [46295], [36676], [34616], [3921], [26991], [27720], [46265], [654], [9855], [40354], [5291], [34904], [44342], [2470], [14598], [880], [19282], [2498], [24237], [21431], [16369], [8994], [44524], [45662], [13663], [37077], [1447], [37786], [30863], [42854], [1019], [20322], [4398], [12159], [44072], [48664], [31547], [18736], [9259], [31], [16354], [21810], [4357], [37982], [5064], [2033], [32871], [47446], [62], [22158], [37387], [8743], [47007], [17981], [11049], [4622], [37916], [36786], [35138], [29925], [14157], [18095], [27829], [1181], [22226], [5709], [4725], [30189], [37014], [1254], [11380], [42989], [696], [24576], [39487], [30119], [1092], [8088], [2194], [9899], [14412], [21828], [3725], [13544], [5180], [44679], [34398], [3891], [28739], [14219], [37594], [49550], [11326], [6904], [17266], [5749], [10174], [23405], [9955], [38271], [41018], [13011], [48392], [36784], [24254], [21687], [23734], [5413], [41447], [45472], [10122], [17555], [15830], [47384], [12084], [31350], [47940], [11661], [27988], [45443], [905], [49651], [16614], [34993], [6781], [30803], [35869], [8001], [41604], [28118], [46462], [46762], [16262], [17281], [5774], [10943], [5013], [18257], [6750], [4713], [3951], [11899], [38791], [16943], [37596], [9318], [18413], [40473], [13208], [16375]] \ No newline at end of file +badwordsids_neox = [[0], [1], [44162], [9502], [12520], [31841], [36320], [49824], [34417], [6038], [34494], [24815], [26635], [24345], [3455], [28905], [44270], [17278], [32666], [46880], [7086], [43189], [37322], [17778], [20879], [49821], [3138], [14490], [4681], [21391], [26786], [43134], [9336], [683], [48074], [41256], [19181], [29650], [28532], [36487], [45114], [46275], [16445], [15104], [11337], [1168], [5647], [29], [27482], [44965], [43782], [31011], [42944], [47389], [6334], [17548], [38329], [32044], [35487], [2239], [34761], [7444], [1084], [12399], [18990], [17636], [39083], [1184], [35830], [28365], [16731], [43467], [47744], [1138], [16079], [40116], [45564], [18297], [42368], [5456], [18022], [42696], [34476], [23505], [23741], [39334], [37944], [45382], [38709], [33440], [26077], [43600], [34418], [36033], [6660], [48167], [48471], [15775], [19884], [41533], [1008], [31053], [36692], [46576], [20095], [20629], [31759], [46410], [41000], [13488], [30952], [39258], [16160], [27655], [22367], [42767], [43736], [49694], [13811], [12004], [46768], [6257], [37471], [5264], [44153], [33805], [20977], [21083], [25416], [14277], [31096], [42041], [18331], [33376], [22372], [46294], [28379], [38475], [1656], [5204], [27075], [50001], [16616], [11396], [7748], [48744], [35402], [28120], [41512], [4207], [43144], [14767], [15640], [16595], [41305], [44479], [38958], [18474], [22734], [30522], [46267], [60], [13976], [31830], [48701], [39822], [9014], [21966], [31422], [28052], [34607], [2479], [3851], [32214], [44082], [45507], [3001], 
[34368], [34758], [13380], [38363], [4299], [46802], [30996], [12630], [49236], [7082], [8795], [5218], [44740], [9686], [9983], [45301], [27114], [40125], [1570], [26997], [544], [5290], [49193], [23781], [14193], [40000], [2947], [43781], [9102], [48064], [42274], [18772], [49384], [9884], [45635], [43521], [31258], [32056], [47686], [21760], [13143], [10148], [26119], [44308], [31379], [36399], [23983], [46694], [36134], [8562], [12977], [35117], [28591], [49021], [47093], [28653], [29013], [46468], [8605], [7254], [25896], [5032], [8168], [36893], [38270], [20499], [27501], [34419], [29547], [28571], [36586], [20871], [30537], [26842], [21375], [31148], [27618], [33094], [3291], [31789], [28391], [870], [9793], [41361], [47916], [27468], [43856], [8850], [35237], [15707], [47552], [2730], [41449], [45488], [3073], [49806], [21938], [24430], [22747], [20924], [46145], [20481], [20197], [8239], [28231], [17987], [42804], [47269], [29972], [49884], [21382], [46295], [36676], [34616], [3921], [26991], [27720], [46265], [654], [9855], [40354], [5291], [34904], [44342], [2470], [14598], [880], [19282], [2498], [24237], [21431], [16369], [8994], [44524], [45662], [13663], [37077], [1447], [37786], [30863], [42854], [1019], [20322], [4398], [12159], [44072], [48664], [31547], [18736], [9259], [31], [16354], [21810], [4357], [37982], [5064], [2033], [32871], [47446], [62], [22158], [37387], [8743], [47007], [17981], [11049], [4622], [37916], [36786], [35138], [29925], [14157], [18095], [27829], [1181], [22226], [5709], [4725], [30189], [37014], [1254], [11380], [42989], [696], [24576], [39487], [30119], [1092], [8088], [2194], [9899], [14412], [21828], [3725], [13544], [5180], [44679], [34398], [3891], [28739], [14219], [37594], [49550], [11326], [6904], [17266], [5749], [10174], [23405], [9955], [38271], [41018], [13011], [48392], [36784], [24254], [21687], [23734], [5413], [41447], [45472], [10122], [17555], [15830], [47384], [12084], [31350], [47940], [11661], [27988], [45443], [905], [49651], [16614], [34993], [6781], [30803], [35869], [8001], [41604], [28118], [46462], [46762], [16262], [17281], [5774], [10943], [5013], [18257], [6750], [4713], [3951], [11899], [38791], [16943], [37596], [9318], [18413], [40473], [13208], [16375]] diff --git a/static/koboldai.js b/static/koboldai.js index cce66f80..05dcc47e 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -1472,6 +1472,7 @@ function show_model_menu(data) { document.getElementById("modelurl").classList.add("hidden"); document.getElementById("use_gpu_div").classList.add("hidden"); document.getElementById("use_8_bit_div").classList.add("hidden"); + document.getElementById("use_4_bit_div").classList.add("hidden"); document.getElementById("modellayers").classList.add("hidden"); document.getElementById("oaimodel").classList.add("hidden"); var model_layer_bars = document.getElementById('model_layer_bars'); @@ -1646,6 +1647,14 @@ function selected_model_info(data) { document.getElementById("use_8_bit").checked = false; } + //hide or unhide 4 bit mode + if (data.bit_4_available) { + document.getElementById("use_4_bit_div").classList.remove("hidden"); + } else { + document.getElementById("use_4_bit_div").classList.add("hidden"); + document.getElementById("use_4_bit").checked = false; + } + //default URL loading if (data.default_url != null) { document.getElementById("modelurl").value = data.default_url; @@ -1815,7 +1824,7 @@ function selected_model_info(data) { } accept.disabled = false; - + set_4_bit_mode(invert=false); } function 
update_gpu_layers() { @@ -1876,7 +1885,8 @@ function load_model() { 'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(), 'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value, 'online_model': selected_models, - 'use_8_bit': document.getElementById('use_8_bit').checked}; + 'use_8_bit': document.getElementById('use_8_bit').checked, + 'use_4_bit': document.getElementById('use_4_bit').checked}; socket.emit("load_model", message); closePopups(); } @@ -3160,6 +3170,22 @@ function save_preset() { closePopups(); } +function set_4_bit_mode(invert=true) { + bit_4_status = document.getElementById("use_4_bit").checked; + if (invert) { + bit_4_status = !bit_4_status; + } + if (bit_4_status) { + document.getElementById("modellayers").classList.add("hidden"); + socket.emit("use_4_bit_toggle", {"use_4_bit": false}); + } else { + document.getElementById("modellayers").classList.remove("hidden"); + socket.emit("use_4_bit_toggle", {"use_4_bit": true}); + } +} + + + //--------------------------------------------General UI Functions------------------------------------ function set_ui_level(level) { for (classname of ['setting_container', 'setting_container_single', 'setting_container_single_wide', 'biasing', 'palette_area', 'advanced_theme']) { @@ -7301,4 +7327,4 @@ $el("#gamescreen").addEventListener("paste", function(event) { false, event.clipboardData.getData("text/plain") ); -}); \ No newline at end of file +}); diff --git a/templates/popups.html b/templates/popups.html index 44cf7cb6..804b1b9f 100644 --- a/templates/popups.html +++ b/templates/popups.html @@ -75,6 +75,10 @@
Use 8 bit mode
+ @@ -402,4 +406,4 @@ -
\ No newline at end of file +
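The loading path that the patch above wires into load_model reduces to: look for a 4bit.pt checkpoint inside the model folder, pick the GPTQ load_quant helper that matches koboldai_vars.model_type, and pair it with the matching tokenizer. The sketch below restates that flow as a standalone function, assuming the GPTQ submodule is checked out at repos/gptq as in the .gitmodules change; model_dir and model_type are placeholder arguments standing in for the koboldai_vars fields, and the three-argument load_quant call mirrors this patch (a later patch in the series, "Adapt KoboldAI to latest gptq changes", adds a fourth group-size argument of -1). It is an illustrative consolidation, not the exact aiserver.py code.

    # Minimal sketch of the 4-bit GPTQ load dispatch added above.
    # Assumes repos/gptq is present; model_dir / model_type are hypothetical
    # stand-ins for koboldai_vars.custmodpth / koboldai_vars.model_type.
    import os
    import sys

    sys.path.insert(0, os.path.abspath("repos/gptq"))
    from gptj import load_quant as gptj_load_quant
    from gptneox import load_quant as gptneox_load_quant
    from llama import load_quant as llama_load_quant
    from transformers import AutoTokenizer, LlamaTokenizer


    def load_4bit(model_dir, model_type, wbits=4):
        """Load a pre-quantized GPTQ checkpoint (4bit.pt) for a supported model type."""
        checkpoint = os.path.join(model_dir, "4bit.pt")
        if not os.path.isfile(checkpoint):
            raise RuntimeError(f"4-bit load failed. PT-File not found at {checkpoint}")

        if model_type == "gptj":
            model = gptj_load_quant(model_dir, checkpoint, wbits)
            tokenizer = AutoTokenizer.from_pretrained(model_dir)
        elif model_type == "gpt_neox":
            model = gptneox_load_quant(model_dir, checkpoint, wbits)
            tokenizer = AutoTokenizer.from_pretrained(model_dir)
        elif model_type == "llama":
            model = llama_load_quant(model_dir, checkpoint, wbits)
            tokenizer = LlamaTokenizer.from_pretrained(model_dir)
        else:
            raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")

        return model, tokenizer

After this dispatch the patch keeps the quantized model on a single GPU (it forces breakmodel off and usegpu on), since the GPTQ CUDA kernels do not participate in the usual layer-splitting path.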
From c7edc764b95d44603e4d450d4326ce3628188ef3 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 21 Mar 2023 21:58:31 +0000 Subject: [PATCH 012/113] Fix llama loading --- aiserver.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/aiserver.py b/aiserver.py index 7497dfb9..967af85f 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2915,10 +2915,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal @functools.lru_cache(maxsize=None) def get_original_key(key): - # try: - return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) - # except ValueError: - # return key + try: + return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + except ValueError: + return key for key, value in model_dict.items(): original_key = get_original_key(key) @@ -3128,8 +3128,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, aborting 4-bit load") - use_4_bit = False + print(f"4-bit file {path_4bit} not found, loading failed") + raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") if use_4_bit: print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") @@ -3155,7 +3155,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) if model is None: raise RuntimeError("Model returned 'None'. 
This is not expected to happen, but due to this, the model will not load.") From 8941428c66c377baa10aa95afd3186733dd92b89 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 22 Mar 2023 06:22:34 +0000 Subject: [PATCH 013/113] Fix Kobold loading to CPU in 4-bit, causing CUDA ASSERT error --- aiserver.py | 6 ++++-- repos/gptq | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 967af85f..2c50cfcc 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3102,7 +3102,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors - if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel: + if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel and not use_4_bit: device_config(model_config) # Download model from Huggingface if it does not exist, otherwise load locally @@ -3133,6 +3133,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if use_4_bit: print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") + koboldai_vars.breakmodel = False + koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) @@ -3255,7 +3257,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) - if(not koboldai_vars.lazy_load): + if(not koboldai_vars.lazy_load and not use_4_bit): device_config(model.config) move_model_to_devices(model, use_4_bit) elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): diff --git a/repos/gptq b/repos/gptq index a8303654..791cfe37 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit a8303654c200c25577130466e5f9bc1e70fc8a50 +Subproject commit 791cfe376af33aa01032dd52147050083a6345cf From 026eb3205e0f48dac5a4aa965d3e48d79ec5e1ab Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 22 Mar 2023 22:12:06 +0000 Subject: [PATCH 014/113] Fix 4-bit loading error when not loading in 4-bit --- aiserver.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2c50cfcc..745a7cb8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3125,13 +3125,13 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): - path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") - - if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, loading failed") - raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") - if use_4_bit: + path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") + + if not os.path.isfile(path_4bit): + print(f"4-bit file {path_4bit} not found, loading failed") + raise RuntimeError(f"4-bit load failed. 
PT-File not found at {path_4bit}") + print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") koboldai_vars.breakmodel = False koboldai_vars.usegpu = True From 9dcba3897810499786d1fb4b4bd8d41ef595a130 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 24 Mar 2023 19:07:28 +0000 Subject: [PATCH 015/113] Pin transformers to a working Llama-compatible version --- environments/huggingface.yml | 2 +- environments/rocm.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 42dda9c3..6807627e 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -30,7 +30,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - git+https://github.com/zphang/transformers@llama_push + - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc - huggingface_hub==0.12.1 - safetensors diff --git a/environments/rocm.yml b/environments/rocm.yml index 43fd331f..a1d3d8b0 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -29,7 +29,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers==4.25.1 + - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 - huggingface_hub==0.12.1 - safetensors - accelerate From 2e7a8a1a66a3813ff2f68b5e37f659479f44afc2 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 23 Mar 2023 05:53:30 +0000 Subject: [PATCH 016/113] Adapt KoboldAI to latest gptq changes --- aiserver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 745a7cb8..faee85c0 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3136,13 +3136,13 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.breakmodel = False koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") From d1a2005a2710e0720fe2a863ebe4f5d1f9b2ad18 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 27 Mar 2023 20:45:21 +0000 Subject: [PATCH 017/113] Add support for old and new 4-bit format. 
Old one needs 4bit-old.pt file to launch --- aiserver.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index faee85c0..fa2af0f3 100644 --- a/aiserver.py +++ b/aiserver.py @@ -94,7 +94,6 @@ sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant -vars_4bit = {} if lupa.LUA_VERSION[:2] != (5, 4): @@ -3127,9 +3126,29 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if use_4_bit: path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") + path_4bit_old = os.path.join(koboldai_vars.custmodpth, "4bit-old.pt") + + # Monkey-patch in old-format pt-file support + if not os.path.isfile(path_4bit): + print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") + path_4bit = path_4bit_old + + import llama, opt, gptneox, gptj, old_quant, quant_cuda_old + llama.make_quant = old_quant.old_make_quant + opt.make_quant = old_quant.old_make_quant + gptneox.make_quant = old_quant.old_make_quant + gptj.make_quant = old_quant.old_make_quant + elif llama.make_quant == old_quant.old_make_quant: + # Undo monkey patch + import quant + llama.make_quant = quant.make_quant + opt.make_quant = quant.make_quant + gptneox.make_quant = quant.make_quant + gptj.make_quant = quant.make_quant + if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, loading failed") + print(f"4-bit old-format file {path_4bit} not found, loading failed") raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") From 0f1fc46078f9a751e35c0c5e7e35d091a10f3f9b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 27 Mar 2023 21:30:43 +0000 Subject: [PATCH 018/113] Fix errors during inference --- aiserver.py | 14 +++++++++++--- repos/gptq | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/aiserver.py b/aiserver.py index fa2af0f3..2c2eff1b 100644 --- a/aiserver.py +++ b/aiserver.py @@ -94,6 +94,7 @@ sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant +monkey_patched_4bit = False if lupa.LUA_VERSION[:2] != (5, 4): @@ -3128,23 +3129,28 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") path_4bit_old = os.path.join(koboldai_vars.custmodpth, "4bit-old.pt") + global monkey_patched_4bit + # Monkey-patch in old-format pt-file support if not os.path.isfile(path_4bit): print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") path_4bit = path_4bit_old - import llama, opt, gptneox, gptj, old_quant, quant_cuda_old + import llama, opt, gptneox, gptj, old_quant llama.make_quant = old_quant.old_make_quant opt.make_quant = old_quant.old_make_quant gptneox.make_quant = old_quant.old_make_quant gptj.make_quant = old_quant.old_make_quant - elif llama.make_quant == old_quant.old_make_quant: + monkey_patched_4bit = True + elif monkey_patched_4bit: # Undo monkey patch - import quant + print("Undoing 4-bit old format monkey patch") + import llama, opt, gptneox, gptj, quant llama.make_quant = quant.make_quant opt.make_quant = quant.make_quant gptneox.make_quant = quant.make_quant gptj.make_quant = quant.make_quant + 
monkey_patched_4bit = False if not os.path.isfile(path_4bit): @@ -3165,6 +3171,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") + + model = model.float() else: try: tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) diff --git a/repos/gptq b/repos/gptq index 791cfe37..0748a680 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 791cfe376af33aa01032dd52147050083a6345cf +Subproject commit 0748a680e95ab0a9f8860953a5d705a01070d1cc From ef6fe680a97efb740db946c0e4fbf5d2dd54889b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 28 Mar 2023 06:30:02 +0000 Subject: [PATCH 019/113] Fix high VRAM usage caused by workaround for scalar type error --- aiserver.py | 2 +- repos/gptq | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2c2eff1b..27cafd59 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3172,7 +3172,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") - model = model.float() + model = model.half() else: try: tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) diff --git a/repos/gptq b/repos/gptq index 0748a680..5d07f25a 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 0748a680e95ab0a9f8860953a5d705a01070d1cc +Subproject commit 5d07f25a30f8602aedb3e69f11de07624e486ce9 From e698f22706c806e05fdd8c58f91f3d560bcba0d6 Mon Sep 17 00:00:00 2001 From: Digitous <107712289+Digitous@users.noreply.github.com> Date: Tue, 28 Mar 2023 19:14:46 -0400 Subject: [PATCH 020/113] Update README.md --- README.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/README.md b/README.md index 20a1957a..c6e922aa 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,57 @@ +## This is a fork of KoboldAI that implements 4bit GPTQ quantized support to include Llama. + +### Install/Use Guide +(This guide is for both Linux and Windows and assumes user has git installed and a basic grasp of command line use) + +In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. + +git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules + +cd KoboldAI + +Next step, subfolder mode or B: option doesn't matter choose either + +[if on Windows] install_requirements.bat if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. + +[if on Linux] install_requirements.sh + + +[if on Windows] run commandline.bat + +[if on Linux] run commandline.sh + +commandline.bat/commandline.sh will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). 
+ + +cd repos + +cd gptq + + +[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] python setup_cuda.py install + +[if on Linux] python setup_cuda.py install + +After the Cuda kernel is compiled, return to KoboldAI base directory + +[if on Windows (only applies to windows users)] pip install flask_cors + +If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) + +Put your 4bit quantized .pt in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). + +Then move your model folder to KoboldAI/models, and rename the .pt in your model folder to 4bit.pt + +So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). + +If you haven't done so already, exit the command prompt/leave KAI's (base) venv + +Run play.bat [windows] or play.sh [linux] + +Switch to UI2, enable Experimental UI under the Interface tab, then load your model and be sure 4-bit toggle is on. + +The 4bit toggle shows when a model to load is selected. + ## KoboldAI - Your gateway to GPT writing This is a browser-based front-end for AI-assisted writing with multiple local & remote AI models. It offers the standard array of tools, including Memory, Author's Note, World Info, Save & Load, adjustable AI settings, formatting options, and the ability to import existing AI Dungeon adventures. You can also turn on Adventure mode and play the game like AI Dungeon Unleashed. From 8d008b87a608beb47e5f41473a40b437aa33d4b4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 13:25:06 +0000 Subject: [PATCH 021/113] Add OPT support --- aiserver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/aiserver.py b/aiserver.py index 27cafd59..edce6bf1 100644 --- a/aiserver.py +++ b/aiserver.py @@ -94,6 +94,7 @@ sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant +from opt import load_quant as opt_load_quant monkey_patched_4bit = False @@ -3169,6 +3170,9 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal elif koboldai_vars.model_type == "llama": model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif koboldai_vars.model_type == "opt": + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. 
Model type {koboldai_vars.model_type} not supported in 4-bit") From f6f7687cc015821c4d4b4cff7dbfea1052514efb Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 14:47:59 +0000 Subject: [PATCH 022/113] Add 4bit safetensor support, improve loading code --- aiserver.py | 78 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/aiserver.py b/aiserver.py index edce6bf1..2679ddc8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -90,6 +90,7 @@ global tpu_mtj_backend # 4-bit dependencies from pathlib import Path +import glob sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant @@ -2657,6 +2658,50 @@ def unload_model(): #Reload our badwords koboldai_vars.badwordsids = koboldai_settings.badwordsids_default + + +def prepare_4bit_load(modelpath): + paths_4bit = ["4bit.pt", "4bit.safetensors"] + paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] + result = False + for p in paths_4bit: + p = os.path.join(modelpath, p) + if os.path.isfile(p): + result = p + break + + global monkey_patched_4bit + + # Monkey-patch in old-format pt-file support + if not result: + print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") + for p in paths_4bit_old: + p = os.path.join(modelpath, p) + if os.path.isfile(p): + result = p + break + + if not result: + print(f"4-bit old-format file {path_4bit} not found, loading failed") + raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") + + import llama, opt, gptneox, gptj, old_quant + llama.make_quant = old_quant.old_make_quant + opt.make_quant = old_quant.old_make_quant + gptneox.make_quant = old_quant.old_make_quant + gptj.make_quant = old_quant.old_make_quant + monkey_patched_4bit = True + elif monkey_patched_4bit: + # Undo monkey patch + print("Undoing 4-bit old format monkey patch") + import llama, opt, gptneox, gptj, quant + llama.make_quant = quant.make_quant + opt.make_quant = quant.make_quant + gptneox.make_quant = quant.make_quant + gptj.make_quant = quant.make_quant + monkey_patched_4bit = False + + return result def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False): @@ -3127,36 +3172,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(os.path.isdir(koboldai_vars.custmodpth)): if use_4_bit: - path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") - path_4bit_old = os.path.join(koboldai_vars.custmodpth, "4bit-old.pt") - - global monkey_patched_4bit - - # Monkey-patch in old-format pt-file support - if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") - path_4bit = path_4bit_old - - import llama, opt, gptneox, gptj, old_quant - llama.make_quant = old_quant.old_make_quant - opt.make_quant = old_quant.old_make_quant - gptneox.make_quant = old_quant.old_make_quant - gptj.make_quant = old_quant.old_make_quant - monkey_patched_4bit = True - elif monkey_patched_4bit: - # Undo monkey patch - print("Undoing 4-bit old format monkey patch") - import llama, opt, gptneox, gptj, quant - llama.make_quant = quant.make_quant - opt.make_quant = quant.make_quant - gptneox.make_quant = quant.make_quant - gptj.make_quant = quant.make_quant - monkey_patched_4bit = False - - - if not os.path.isfile(path_4bit): - 
print(f"4-bit old-format file {path_4bit} not found, loading failed") - raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") + path_4bit = prepare_4bit_load(koboldai_vars.custmodpth) print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") koboldai_vars.breakmodel = False @@ -3171,7 +3187,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") From a0bc77042624571b878d734ebc41331f6f4d9342 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 19:49:05 +0000 Subject: [PATCH 023/113] Add basic groupsize support Write groupsize into filename, for example 4bit-128b.safetensors for groupsize 128 --- aiserver.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2679ddc8..38805287 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2661,13 +2661,19 @@ def unload_model(): def prepare_4bit_load(modelpath): - paths_4bit = ["4bit.pt", "4bit.safetensors"] + paths_4bit = ["4bit*.safetensors", "4bit*.pt"] paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] result = False + groupsize = -1 for p in paths_4bit: p = os.path.join(modelpath, p) - if os.path.isfile(p): - result = p + val = glob.glob(p) + if val: + result = val[0] + fname = Path(result).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\d+)(?:b-?)", fname) + if g: + groupsize = int(g[0]) break global monkey_patched_4bit @@ -2701,7 +2707,7 @@ def prepare_4bit_load(modelpath): gptj.make_quant = quant.make_quant monkey_patched_4bit = False - return result + return result, groupsize def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False): @@ -3172,22 +3178,23 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(os.path.isdir(koboldai_vars.custmodpth)): if use_4_bit: - path_4bit = prepare_4bit_load(koboldai_vars.custmodpth) + path_4bit, groupsize = prepare_4bit_load(koboldai_vars.custmodpth) + print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") koboldai_vars.breakmodel = False koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif 
koboldai_vars.model_type == "opt": - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") From 73d5ec0e5dd234852a66331b681734e8beb13781 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 20:07:26 +0000 Subject: [PATCH 024/113] Pull latest gptq-changes --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 5d07f25a..6f80e1fd 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 5d07f25a30f8602aedb3e69f11de07624e486ce9 +Subproject commit 6f80e1fdd7232d66e45e02ebf00be4c5d5f933bc From 9d0477f5f73471995fa3e23789a0ac4aa9108b33 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 22:05:44 +0000 Subject: [PATCH 025/113] Fix bug where it picks old model despite new one available --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 38805287..812bc4a8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2667,7 +2667,7 @@ def prepare_4bit_load(modelpath): groupsize = -1 for p in paths_4bit: p = os.path.join(modelpath, p) - val = glob.glob(p) + val = [v for v in glob.glob(p) if "4bit-old" not in v] if val: result = val[0] fname = Path(result).parts[-1] From 61b13604b6ad116561488ab146c3959f40d98099 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 30 Mar 2023 10:57:04 +0200 Subject: [PATCH 026/113] Fix bug in 4-bit load fallback --- aiserver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 812bc4a8..fe0f9a8c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2680,7 +2680,7 @@ def prepare_4bit_load(modelpath): # Monkey-patch in old-format pt-file support if not result: - print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") + print("4-bit file not found, falling back to old format.") for p in paths_4bit_old: p = os.path.join(modelpath, p) if os.path.isfile(p): @@ -2688,8 +2688,8 @@ def prepare_4bit_load(modelpath): break if not result: - print(f"4-bit old-format file {path_4bit} not found, loading failed") - raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") + print("4-bit old-format file not found, loading failed.") + raise RuntimeError(f"4-bit load failed. 
PT-File not found.") import llama, opt, gptneox, gptj, old_quant llama.make_quant = old_quant.old_make_quant From aa2292b3a4dff467e9afaa3270d80fcda4c7994f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 21:43:49 +0000 Subject: [PATCH 027/113] Enable multi-gpu support --- aiserver.py | 50 ++++++++++++++++------------------------------ static/koboldai.js | 9 +-------- 2 files changed, 18 insertions(+), 41 deletions(-) diff --git a/aiserver.py b/aiserver.py index fe0f9a8c..7a4370c0 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1139,7 +1139,7 @@ def move_model_to_devices(model, use_4_bit=False): import accelerate.utils for key, value in model.state_dict().items(): target_dtype = torch.float32 if breakmodel.primary_device == "cpu" else torch.float16 - if(value.dtype is not target_dtype): + if(value.dtype not in (torch.bool, torch.int) and value.dtype is not target_dtype): accelerate.utils.set_module_tensor_to_device(model, key, target_dtype) disk_blocks = breakmodel.disk_blocks gpu_blocks = breakmodel.gpu_blocks @@ -1919,18 +1919,6 @@ def get_cluster_models(msg): emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2") -@socketio.on("use_4_bit_toggle") -def use_4_bit_toggle(msg): - # Disable lazy_load and breakmodel - if msg["use_4_bit"]: - koboldai_vars.lazy_load = False - koboldai_vars.nobreakmodel = True - else: - koboldai_vars.lazy_load = True - koboldai_vars.nobreakmodel = False - - # TODO: Reload JS values for this stuff - # Function to patch transformers to use our soft prompt def patch_causallm(model): from torch.nn import Embedding @@ -3033,11 +3021,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
", end="", flush=True) model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - if not use_4_bit: - if model_dict[key].dtype is torch.float32: - koboldai_vars.fp32_model = True - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - model_dict[key] = model_dict[key].to(torch.float16) + if model_dict[key].dtype is torch.float32: + koboldai_vars.fp32_model = True + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: + model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -3061,17 +3048,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if utils.offload_index: for name, tensor in utils.named_buffers: dtype = tensor.dtype - if not use_4_bit: - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): - dtype = torch.float32 - if name in model_dict and model_dict[name].dtype is not dtype: - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is not dtype: - tensor = tensor.to(dtype) - if name not in utils.offload_index: - accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): + dtype = torch.float16 + if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): + dtype = torch.float32 + if name in model_dict and model_dict[name].dtype is not dtype: + model_dict[name] = model_dict[name].to(dtype) + if tensor.dtype is not dtype: + tensor = tensor.to(dtype) + if name not in utils.offload_index: + accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache") utils.bar.close() utils.bar = None @@ -3154,7 +3140,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors - if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel and not use_4_bit: + if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel: device_config(model_config) # Download model from Huggingface if it does not exist, otherwise load locally @@ -3182,8 +3168,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") - koboldai_vars.breakmodel = False - koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": model = 
gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) @@ -3311,7 +3295,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) - if(not koboldai_vars.lazy_load and not use_4_bit): + if(not koboldai_vars.lazy_load): device_config(model.config) move_model_to_devices(model, use_4_bit) elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): diff --git a/static/koboldai.js b/static/koboldai.js index 05dcc47e..89ee2ea1 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -3173,14 +3173,7 @@ function save_preset() { function set_4_bit_mode(invert=true) { bit_4_status = document.getElementById("use_4_bit").checked; if (invert) { - bit_4_status = !bit_4_status; - } - if (bit_4_status) { - document.getElementById("modellayers").classList.add("hidden"); - socket.emit("use_4_bit_toggle", {"use_4_bit": false}); - } else { - document.getElementById("modellayers").classList.remove("hidden"); - socket.emit("use_4_bit_toggle", {"use_4_bit": true}); + bit_4_status = !bit_4_status; } } From 6eae4574793687b517c45e85e5fc178015c8d088 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 31 Mar 2023 15:36:03 +0200 Subject: [PATCH 028/113] Fix 4bit groupsize param letter Use g instead of b for groupsize name, for example 4bit-128g.safetensors --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 7a4370c0..e7c789ac 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2659,7 +2659,7 @@ def prepare_4bit_load(modelpath): if val: result = val[0] fname = Path(result).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\d+)(?:b-?)", fname) + g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname) if g: groupsize = int(g[0]) break From d3a5ca65057f4f7cf9a2998cd13e5e04de829df1 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 1 Apr 2023 08:52:08 +0000 Subject: [PATCH 029/113] Update gptq submodule to latest --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 6f80e1fd..f4de1019 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 6f80e1fdd7232d66e45e02ebf00be4c5d5f933bc +Subproject commit f4de1019fedce779915e050a517f4cc8dee432dc From bf0c999412b48a6de6a174a33bce3f8b92df1e16 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 1 Apr 2023 14:19:51 +0200 Subject: [PATCH 030/113] Update GPTQ to support AMD --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index f4de1019..954b3218 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit f4de1019fedce779915e050a517f4cc8dee432dc +Subproject commit 954b32183adda2acd437a3ab0683a28ca3c7e4c9 From 110f8229c565a1ac64060e4e1785d4563920d4f4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 1 Apr 2023 21:33:05 +0200 Subject: [PATCH 031/113] Add cudatoolkit-dev for compilation, compatible gcc 9 and update transformers to fix error in gptq --- environments/huggingface.yml | 5 ++++- environments/rocm.yml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 6807627e..71d26e9c 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -11,6 +11,9 @@ dependencies: - pytorch=1.11.* - python=3.8.* - cudatoolkit=11.1 + - cudatoolkit-dev=11.1 
+ - gcc=9.* + - gxx=9.* - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -30,7 +33,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 + - git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc - huggingface_hub==0.12.1 - safetensors diff --git a/environments/rocm.yml b/environments/rocm.yml index a1d3d8b0..dda2a2b2 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -29,7 +29,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 + - git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda - huggingface_hub==0.12.1 - safetensors - accelerate From 2729b7764047b7c1d35f7a20e5900d61147fe598 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 2 Apr 2023 10:32:19 +0200 Subject: [PATCH 032/113] Add offload.py adapted from llama_inference_offload.py, with multi-gpu support and some improvements. Not yet functional, and still just supports Llama --- aiserver.py | 17 +++++++++++++++-- repos/gptq | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index e7c789ac..82992461 100644 --- a/aiserver.py +++ b/aiserver.py @@ -96,6 +96,7 @@ from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant from opt import load_quant as opt_load_quant +from offload import load_quant_offload monkey_patched_4bit = False @@ -3137,6 +3138,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.model_type == "gpt2"): lowmem = {} koboldai_vars.lazy_load = False # Also, lazy loader doesn't support GPT-2 models + + gpu_layers_list = [int(l) for l in gpu_layers.split(",")] + offload_4bit = use_4_bit and sum(gpu_layers_list) < utils.num_layers(model_config) + + if offload_4bit: + koboldai_vars.lazy_load = False # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors @@ -3175,7 +3182,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(llama_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) @@ -3286,7 +3296,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal patch_causallm(model) if(koboldai_vars.hascuda): - if(koboldai_vars.usegpu): + if offload_4bit: + koboldai_vars.modeldim = get_hidden_size_from_model(model) + generator = model.generate + elif(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) if not use_4_bit: model = model.half().to(koboldai_vars.gpu_device) diff --git a/repos/gptq b/repos/gptq index 954b3218..f8bc2886 160000 --- 
a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 954b32183adda2acd437a3ab0683a28ca3c7e4c9 +Subproject commit f8bc2886cb2e2aaa704ea02404c2ff3841eb6fcf From e742083703ea8111379492c75e62f9dfffd54a28 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 2 Apr 2023 11:17:29 +0200 Subject: [PATCH 033/113] Fix multi-gpu-offloading --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index f8bc2886..971a5785 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit f8bc2886cb2e2aaa704ea02404c2ff3841eb6fcf +Subproject commit 971a5785a356460f2073b0141da0c1e0b8fdcbf6 From c8d00b7a10fd48f31f9d3fc4f4010f5481c772d4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 2 Apr 2023 18:36:31 +0200 Subject: [PATCH 034/113] Add CPU offloading support for GPT-NeoX, GPT-J and OPT --- aiserver.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 82992461..2365f58b 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3144,6 +3144,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if offload_4bit: koboldai_vars.lazy_load = False + print("4-bit CPU offloader active") # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors @@ -3176,10 +3177,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") if koboldai_vars.model_type == "gptj": - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(gptj_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(gptneox_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": if offload_4bit: @@ -3188,7 +3195,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(opt_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. 
Model type {koboldai_vars.model_type} not supported in 4-bit") From ec4177a6d6cf3549f3aebffc1a54b4799c506657 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 3 Apr 2023 06:50:36 +0200 Subject: [PATCH 035/113] Remove cudatoolkit-dev and gcc/gxx 9 from conda env because they didn't resolve on Windows --- environments/huggingface.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 71d26e9c..b48c2547 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -11,9 +11,6 @@ dependencies: - pytorch=1.11.* - python=3.8.* - cudatoolkit=11.1 - - cudatoolkit-dev=11.1 - - gcc=9.* - - gxx=9.* - eventlet=0.33.3 - dnspython=2.2.1 - markdown From b9df9b6f590388a8fc6139e25b1d1c24c21fac52 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 3 Apr 2023 20:27:17 +0200 Subject: [PATCH 036/113] Improve CPU offloading speed significantly when offloading less than half of the layers --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 971a5785..e2f567e9 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 971a5785a356460f2073b0141da0c1e0b8fdcbf6 +Subproject commit e2f567e9332c2d92f1c26ea0f7c935fe7cd65655 From ce6761e74436298424d3ea7bb964bb243e8cd88a Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 4 Apr 2023 07:46:53 +0200 Subject: [PATCH 037/113] Fix issue causing expected scalar type Float but found Half RuntimeErrors --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index e2f567e9..08c5054d 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit e2f567e9332c2d92f1c26ea0f7c935fe7cd65655 +Subproject commit 08c5054d45b8c6277e74a35841570dc7b8cbc608 From 8b4375307c2e4ea1154125fea1e00ef8c1b38415 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 5 Apr 2023 21:10:40 +0200 Subject: [PATCH 038/113] Update file formatting section in guide --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c6e922aa..0296e876 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ If you haven't already done so, create a model folder with the same name as your Put your 4bit quantized .pt in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). -Then move your model folder to KoboldAI/models, and rename the .pt in your model folder to 4bit.pt +Then move your model folder to KoboldAI/models, and rename the .pt/safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). 
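Note on the file-naming convention described in the guide above: the loader built up in the preceding patches (`prepare_4bit_load` in `aiserver.py`, patches 022/023/028) selects the quantized checkpoint by globbing for `4bit*.safetensors` / `4bit*.pt` and parsing the optional groupsize out of the filename. A condensed, self-contained sketch of that detection step follows; the helper name is illustrative, but the glob patterns, the `4bit-old` exclusion, and the regex are the ones used in the diffs.

```python
import glob
import os
import re
from pathlib import Path


def find_4bit_checkpoint(modelpath):
    """Sketch of the 4-bit checkpoint/groupsize detection from prepare_4bit_load.

    Prefers 4bit*.safetensors over 4bit*.pt, skips old-format 4bit-old.* files,
    and reads an optional groupsize from the name (e.g. 4bit-128g.safetensors -> 128).
    Returns (path, groupsize); groupsize stays -1 when the filename carries none.
    """
    groupsize = -1
    for pattern in ("4bit*.safetensors", "4bit*.pt"):
        candidates = [p for p in glob.glob(os.path.join(modelpath, pattern))
                      if "4bit-old" not in p]
        if candidates:
            fname = Path(candidates[0]).parts[-1]
            g = re.findall(r"^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
            if g:
                groupsize = int(g[0])
            return candidates[0], groupsize
    return None, groupsize


# Example: a model folder containing 4bit-128g.safetensors resolves to
# ("<modelpath>/4bit-128g.safetensors", 128); a plain 4bit.pt resolves to (path, -1).
```

In other words, `4bit.safetensors` or `4bit.pt` load with groupsize -1 (no groupsize), while a name like `4bit-128g.safetensors` loads with groupsize 128, matching the example given in the guide.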
From 40092cc9faed0d225391699e4cada1b9fb043dff Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 5 Apr 2023 21:49:13 +0200 Subject: [PATCH 039/113] Improve guide formatting --- README.md | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 0296e876..e103bbff 100644 --- a/README.md +++ b/README.md @@ -5,48 +5,46 @@ In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. -git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules +`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules` -cd KoboldAI +`cd KoboldAI` -Next step, subfolder mode or B: option doesn't matter choose either +Next step, (Windows) subfolder mode or B: option doesn't matter choose either -[if on Windows] install_requirements.bat if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. +[if on Windows] `install_requirements.bat` if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. -[if on Linux] install_requirements.sh +[if on Linux] `install_requirements.sh` -[if on Windows] run commandline.bat +[if on Windows] run `commandline.bat` -[if on Linux] run commandline.sh +[if on Linux] run `commandline.sh` -commandline.bat/commandline.sh will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). +`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). -cd repos +`cd repos` -cd gptq +`cd gptq` -[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] python setup_cuda.py install +[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] `python setup_cuda.py install` -[if on Linux] python setup_cuda.py install +[if on Linux] `python setup_cuda.py install` After the Cuda kernel is compiled, return to KoboldAI base directory -[if on Windows (only applies to windows users)] pip install flask_cors - If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) -Put your 4bit quantized .pt in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). +Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). 
-Then move your model folder to KoboldAI/models, and rename the .pt/safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) +Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). If you haven't done so already, exit the command prompt/leave KAI's (base) venv -Run play.bat [windows] or play.sh [linux] +Run `play.bat` [windows] or `play.sh` [linux] Switch to UI2, enable Experimental UI under the Interface tab, then load your model and be sure 4-bit toggle is on. From 636c4e5a5284fa2a11af7aba2fdf55426047eb0f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 7 Apr 2023 11:48:57 +0200 Subject: [PATCH 040/113] Update gptq repo --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 08c5054d..17c46a59 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 08c5054d45b8c6277e74a35841570dc7b8cbc608 +Subproject commit 17c46a59ff20da657e68f3267f853b0243b983bc From 7efd314428e0ad24b33fc9cd9ac19b45c6754e7b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 7 Apr 2023 20:10:24 +0200 Subject: [PATCH 041/113] Improve guide --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 86661df3..f9be9660 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,11 @@ Next step, (Windows) subfolder mode or B: option doesn't matter choose either [if on Linux] run `commandline.sh` -`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). - +`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment. +On Windows, this will show (base) at the start of the prompt line. +If it shows (base) on Linux, you most likely have a base conda environment that you need to deactivate (`conda deactivate`) +Then run `cd repos` `cd gptq` @@ -42,7 +44,7 @@ Then move your model folder to KoboldAI/models, and rename the .pt or .safetenso So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). -If you haven't done so already, exit the command prompt/leave KAI's (base) venv +If you haven't done so already, exit the command prompt/leave KAI's conda env. 
(Close the commandline window on Windows, run `exit` on Linux) Run `play.bat` [windows] or `play.sh` [linux] From b628aec7194783da09035a3b8fe01f674df542ea Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 10 Apr 2023 22:37:16 +0200 Subject: [PATCH 042/113] Automatic installation of the quant_cuda module during install_requirements Kepler (K40+) and Maxwell support --- install_requirements.bat | 4 ++++ install_requirements.sh | 3 +++ repos/gptq | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/install_requirements.bat b/install_requirements.bat index 2a4534c1..05264259 100644 --- a/install_requirements.bat +++ b/install_requirements.bat @@ -49,6 +49,8 @@ umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\h umamba.exe -r B:\ clean -a -y rd B:\Python\pkgs /S /Q subst B: /d +call B:\python\condabin\activate +cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit @@ -60,5 +62,7 @@ umamba.exe create -r miniconda3\ -n base umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingface.yml -y --always-copy umamba.exe clean -a -y rd miniconda3\Python\pkgs /S /Q +call miniconda3\condabin\activate +cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit diff --git a/install_requirements.sh b/install_requirements.sh index 6f0e0dfd..7b5a8d5b 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -5,6 +5,9 @@ wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar - bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y + +# Install quant_cuda module for 4-bit +bin/micromamba run -r runtime -n koboldai pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl exit fi if [[ $1 = "rocm" ]]; then diff --git a/repos/gptq b/repos/gptq index 17c46a59..50b22e2b 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 17c46a59ff20da657e68f3267f853b0243b983bc +Subproject commit 50b22e2ba8ec0f5cf0dca719392a2ec5254e7228 From 687d107d20345a0cc46bb069914d0ce6a3bcf43d Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 10 Apr 2023 22:46:12 +0200 Subject: [PATCH 043/113] Update README, remove steps that are no longer required --- README.md | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/README.md b/README.md index f9be9660..0657fa0b 100644 --- a/README.md +++ b/README.md @@ -15,27 +15,6 @@ Next step, (Windows) subfolder mode or B: option doesn't matter choose either [if on Linux] `install_requirements.sh` - -[if on Windows] run `commandline.bat` - -[if on Linux] run `commandline.sh` - -`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment. -On Windows, this will show (base) at the start of the prompt line. 
-If it shows (base) on Linux, you most likely have a base conda environment that you need to deactivate (`conda deactivate`) - -Then run -`cd repos` - -`cd gptq` - - -[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] `python setup_cuda.py install` - -[if on Linux] `python setup_cuda.py install` - -After the Cuda kernel is compiled, return to KoboldAI base directory - If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). From 35f908e147fcac121bdafaf7ca4b751d8091f480 Mon Sep 17 00:00:00 2001 From: biscober <50845461+biscober@users.noreply.github.com> Date: Tue, 11 Apr 2023 02:37:48 +0000 Subject: [PATCH 044/113] Update install_requirements.bat (#7) * Update install_requirements.bat move command to dismount temp B drive to after pip install command which requires B drive to still be mounted * Update install_requirements.bat cmd /k not necessary * Update install_requirements.bat add quotes (probably not required but w/e) --- install_requirements.bat | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/install_requirements.bat b/install_requirements.bat index 05264259..3b735ddf 100644 --- a/install_requirements.bat +++ b/install_requirements.bat @@ -48,9 +48,9 @@ umamba.exe create -r B:\python\ -n base umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy umamba.exe -r B:\ clean -a -y rd B:\Python\pkgs /S /Q -subst B: /d call B:\python\condabin\activate -cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" +pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" +subst B: /d pause exit @@ -63,6 +63,6 @@ umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingf umamba.exe clean -a -y rd miniconda3\Python\pkgs /S /Q call miniconda3\condabin\activate -cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" +pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit From 4d34f9b7de03c6843e05cf5e11864d6b180a07b5 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 16 Apr 2023 14:20:13 +0200 Subject: [PATCH 045/113] Move 4-bit loading code to separate inference_model file --- aiserver.py | 91 ++--- modeling/inference_models/hf_torch_4bit.py | 385 +++++++++++++++++++++ 2 files changed, 412 insertions(+), 64 deletions(-) create mode 100644 modeling/inference_models/hf_torch_4bit.py diff --git a/aiserver.py b/aiserver.py index 7e9241f5..0a98d16f 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1776,56 +1776,6 @@ def unload_model(): #Reload our badwords koboldai_vars.badwordsids = koboldai_settings.badwordsids_default - - -def prepare_4bit_load(modelpath): - paths_4bit = ["4bit*.safetensors", "4bit*.pt"] - paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] - result = False - groupsize = -1 - for p in paths_4bit: - p = os.path.join(modelpath, p) - val = [v for v in glob.glob(p) if "4bit-old" not in v] - if val: - result = val[0] - fname = Path(result).parts[-1] - g = 
re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname) - if g: - groupsize = int(g[0]) - break - - global monkey_patched_4bit - - # Monkey-patch in old-format pt-file support - if not result: - print("4-bit file not found, falling back to old format.") - for p in paths_4bit_old: - p = os.path.join(modelpath, p) - if os.path.isfile(p): - result = p - break - - if not result: - print("4-bit old-format file not found, loading failed.") - raise RuntimeError(f"4-bit load failed. PT-File not found.") - - import llama, opt, gptneox, gptj, old_quant - llama.make_quant = old_quant.old_make_quant - opt.make_quant = old_quant.old_make_quant - gptneox.make_quant = old_quant.old_make_quant - gptj.make_quant = old_quant.old_make_quant - monkey_patched_4bit = True - elif monkey_patched_4bit: - # Undo monkey patch - print("Undoing 4-bit old format monkey patch") - import llama, opt, gptneox, gptj, quant - llama.make_quant = quant.make_quant - opt.make_quant = quant.make_quant - gptneox.make_quant = quant.make_quant - gptj.make_quant = quant.make_quant - monkey_patched_4bit = False - - return result, groupsize def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False): @@ -2008,9 +1958,9 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except: pass - try: - from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel - model = GenericHFTorchInferenceModel( + if use_4_bit: + from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel + model = HFTorch4BitInferenceModel( koboldai_vars.model, lazy_load=koboldai_vars.lazy_load, low_mem=args.lowmem @@ -2020,18 +1970,31 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal save_model=not (args.colab or args.cacheonly) or args.savemodel, initial_load=initial_load, ) - except SuperLegacyModelError: - from modeling.inference_models.legacy_gpt2_hf import CustomGPT2HFTorchInferenceModel - model = CustomGPT2HFTorchInferenceModel( - koboldai_vars.model, - lazy_load=koboldai_vars.lazy_load, - low_mem=args.lowmem - ) + else: + try: + from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel + model = GenericHFTorchInferenceModel( + koboldai_vars.model, + lazy_load=koboldai_vars.lazy_load, + low_mem=args.lowmem + ) - model.load( - save_model=not (args.colab or args.cacheonly) or args.savemodel, - initial_load=initial_load, - ) + model.load( + save_model=not (args.colab or args.cacheonly) or args.savemodel, + initial_load=initial_load, + ) + except SuperLegacyModelError: + from modeling.inference_models.legacy_gpt2_hf import CustomGPT2HFTorchInferenceModel + model = CustomGPT2HFTorchInferenceModel( + koboldai_vars.model, + lazy_load=koboldai_vars.lazy_load, + low_mem=args.lowmem + ) + + model.load( + save_model=not (args.colab or args.cacheonly) or args.savemodel, + initial_load=initial_load, + ) logger.info(f"Pipeline created: {koboldai_vars.model}") else: diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py new file mode 100644 index 00000000..21f4ebfe --- /dev/null +++ b/modeling/inference_models/hf_torch_4bit.py @@ -0,0 +1,385 @@ +from __future__ import annotations + +import os +import json +import torch +import re +import shutil +import sys +from typing import Union + +from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, 
LlamaTokenizer +from modeling.inference_model import SuperLegacyModelError + +import utils +import modeling.lazy_loader as lazy_loader +import koboldai_settings +from logger import logger, set_logger_verbosity, quiesce_logger + +try: + import breakmodel +except ModuleNotFoundError as e: + # Breakmodel is only expected to work on GPU + if not utils.koboldai_vars.use_colab_tpu: + raise e + +from modeling.inference_models.hf_torch import HFTorchInferenceModel + +# 4-bit dependencies +from pathlib import Path +import glob +sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) +from gptj import load_quant as gptj_load_quant +from gptneox import load_quant as gptneox_load_quant +from llama import load_quant as llama_load_quant +from opt import load_quant as opt_load_quant +from offload import load_quant_offload +monkey_patched_4bit = False + + +def prepare_4bit_load(modelpath): + paths_4bit = ["4bit*.safetensors", "4bit*.pt"] + paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] + result = False + groupsize = -1 + for p in paths_4bit: + p = os.path.join(modelpath, p) + val = [v for v in glob.glob(p) if "4bit-old" not in v] + if val: + result = val[0] + fname = Path(result).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname) + if g: + groupsize = int(g[0]) + break + + global monkey_patched_4bit + + # Monkey-patch in old-format pt-file support + if not result: + print("4-bit file not found, falling back to old format.") + for p in paths_4bit_old: + p = os.path.join(modelpath, p) + if os.path.isfile(p): + result = p + break + + if not result: + print("4-bit old-format file not found, loading failed.") + raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.") + + import llama, opt, gptneox, gptj, old_quant + llama.make_quant = old_quant.old_make_quant + opt.make_quant = old_quant.old_make_quant + gptneox.make_quant = old_quant.old_make_quant + gptj.make_quant = old_quant.old_make_quant + monkey_patched_4bit = True + elif monkey_patched_4bit: + # Undo monkey patch + print("Undoing 4-bit old format monkey patch") + import llama, opt, gptneox, gptj, quant + llama.make_quant = quant.make_quant + opt.make_quant = quant.make_quant + gptneox.make_quant = quant.make_quant + gptj.make_quant = quant.make_quant + monkey_patched_4bit = False + + return result, groupsize + + +class HFTorch4BitInferenceModel(HFTorchInferenceModel): + def _load(self, save_model: bool, initial_load: bool) -> None: + utils.koboldai_vars.allowsp = True + + # Make model path the same as the model name to make this consistent + # with the other loading method if it isn't a known model type. 
This + # code is not just a workaround for below, it is also used to make the + # behavior consistent with other loading methods - Henk717 + # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]: + # utils.koboldai_vars.custmodpth = utils.koboldai_vars.model + + if self.model_name == "NeoCustom": + self.model_name = os.path.basename( + os.path.normpath(utils.koboldai_vars.custmodpth) + ) + utils.koboldai_vars.model = self.model_name + + self.lazy_load = False + + self.init_model_config() + + gpulayers = utils.args.breakmodel_gpulayers + + try: + gpu_layers_list = [int(l) for l in gpulayers.split(",")] + except ValueError: + gpu_layers_list = [utils.num_layers(self.model_config)] + self.offload_4bit = sum(gpu_layers_list) < utils.num_layers(self.model_config) + + if self.offload_4bit: + utils.koboldai_vars.lazy_load = False + print("4-bit CPU offloader active") + + tf_kwargs = { + "low_cpu_mem_usage": True, + } + + # If we're using torch_lazy_loader, we need to get breakmodel config + # early so that it knows where to load the individual model tensors + if ( + self.lazy_load + and utils.koboldai_vars.hascuda + and utils.koboldai_vars.breakmodel + and not utils.koboldai_vars.nobreakmodel + ): + self.breakmodel_device_config(self.model_config) + + if self.lazy_load: + # If we're using lazy loader, we need to figure out what the model's hidden layers are called + with lazy_loader.use_lazy_load( + dematerialized_modules=True, use_accelerate_init_empty_weights=True + ): + try: + metamodel = AutoModelForCausalLM.from_config(self.model_config) + except Exception as e: + logger.error(f"Fell back to neo for metamodel due to {e}") + try: + metamodel = GPTNeoForCausalLM.from_config(self.model_config) + except Exception as e: + logger.error(f"Falling back again due to {e}") + raise SuperLegacyModelError + + utils.layers_module_names = utils.get_layers_module_names(metamodel) + utils.module_names = list(metamodel.state_dict().keys()) + utils.named_buffers = list(metamodel.named_buffers(recurse=True)) + + # Download model from Huggingface if it does not exist, otherwise load locally + with self._maybe_use_float16(), lazy_loader.use_lazy_load( + enable=self.lazy_load, + callback=self._get_lazy_load_callback(utils.num_layers(self.model_config)) + if self.lazy_load + else None, + dematerialized_modules=True, + ): + if self.lazy_load: + # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time + tf_kwargs.pop("low_cpu_mem_usage", None) + + if self.get_local_model_path(): + # Model is stored locally, load it. + self.model = self._get_model(self.get_local_model_path(), tf_kwargs) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) + else: + # Model not stored locally, we need to download it. 
+ + # _rebuild_tensor patch for casting dtype and supporting LazyTensors + old_rebuild_tensor = torch._utils._rebuild_tensor + + def new_rebuild_tensor( + storage: Union[lazy_loader.LazyTensor, torch.Storage], + storage_offset, + shape, + stride, + ): + if not isinstance(storage, lazy_loader.LazyTensor): + dtype = storage.dtype + else: + dtype = storage.storage_type.dtype + if not isinstance(dtype, torch.dtype): + dtype = storage.storage_type(0).dtype + if dtype is torch.float32 and len(shape) >= 2: + utils.koboldai_vars.fp32_model = True + return old_rebuild_tensor(storage, storage_offset, shape, stride) + + torch._utils._rebuild_tensor = new_rebuild_tensor + self.model = self._get_model(self.model_name, tf_kwargs) + self.tokenizer = self._get_tokenizer(self.model_name) + torch._utils._rebuild_tensor = old_rebuild_tensor + + if save_model: + self.tokenizer.save_pretrained( + self.get_local_model_path(ignore_existance=True) + ) + + if utils.koboldai_vars.fp32_model and not breakmodel.disk_blocks: + # Use save_pretrained to convert fp32 models to fp16, + # unless we are using disk cache because save_pretrained + # is not supported in that case + self.model = self.model.half() + self.model.save_pretrained( + self.get_local_model_path(ignore_existance=True), + max_shard_size="500MiB", + ) + + else: + # For fp16 models, we can just copy the model files directly + import transformers.configuration_utils + import transformers.modeling_utils + import transformers.file_utils + import huggingface_hub + + # Save the config.json + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + self.model_name, + transformers.configuration_utils.CONFIG_NAME, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + os.path.join( + self.get_local_model_path(ignore_existance=True), + transformers.configuration_utils.CONFIG_NAME, + ), + ) + + if utils.num_shards is None: + # Save the pytorch_model.bin or model.safetensors of an unsharded model + any_success = False + possible_checkpoint_names = [ + transformers.modeling_utils.WEIGHTS_NAME, + "model.safetensors", + ] + + for possible_checkpoint_name in possible_checkpoint_names: + try: + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + self.model_name, + possible_checkpoint_name, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + os.path.join( + self.get_local_model_path( + ignore_existance=True + ), + possible_checkpoint_name, + ), + ) + any_success = True + except Exception: + pass + + if not any_success: + raise RuntimeError(f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revisison}'") + else: + # Handle saving sharded models + + with open(utils.from_pretrained_index_filename) as f: + map_data = json.load(f) + filenames = set(map_data["weight_map"].values()) + # Save the pytorch_model.bin.index.json of a sharded model + shutil.move( + os.path.realpath(utils.from_pretrained_index_filename), + os.path.join( + self.get_local_model_path(ignore_existance=True), + transformers.modeling_utils.WEIGHTS_INDEX_NAME, + ), + ) + # Then save the pytorch_model-#####-of-#####.bin files + for filename in filenames: + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + self.model_name, + filename, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + 
os.path.join( + self.get_local_model_path( + ignore_existance=True + ), + filename, + ), + ) + shutil.rmtree("cache/") + + if not self.lazy_load: + utils.layers_module_names = utils.get_layers_module_names(self.model) + utils.module_names = list(self.model.state_dict().keys()) + utils.named_buffers = list(self.model.named_buffers(recurse=True)) + + if ( + utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default + and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj") + ): + utils.koboldai_vars.badwordsids = [ + [v] + for k, v in self.tokenizer.get_vocab().items() + if any(c in str(k) for c in "[]") + ] + + self.patch_embedding() + + if utils.koboldai_vars.hascuda: + if utils.koboldai_vars.usegpu: + # Use just VRAM + self.model = self.model.half().to(utils.koboldai_vars.gpu_device) + elif utils.koboldai_vars.breakmodel: + # Use both RAM and VRAM (breakmodel) + if not self.lazy_load: + self.breakmodel_device_config(self.model.config) + self._move_to_devices() + elif breakmodel.disk_blocks > 0: + # Use disk + self._move_to_devices() + else: + # Use CPU + self.model = self.model.to("cpu").float() + elif breakmodel.disk_blocks > 0: + self._move_to_devices() + else: + self.model = self.model.to("cpu").float() + + self.model.kai_model = self + utils.koboldai_vars.modeldim = self.get_hidden_size() + + def _get_model(self, location: str, tf_kwargs: Dict): + path_4bit, groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth) + print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") + + print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit") + if utils.koboldai_vars.model_type == "gptj": + if self.offload_4bit: + model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = gptj_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + elif utils.koboldai_vars.model_type == "gpt_neox": + if self.offload_4bit: + model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = gptneox_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + elif utils.koboldai_vars.model_type == "llama": + if self.offload_4bit: + model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = llama_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + elif utils.koboldai_vars.model_type == "opt": + if self.offload_4bit: + model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = opt_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + else: + raise RuntimeError(f"4-bit load failed. 
Model type {utils.koboldai_vars.model_type} not supported in 4-bit") + + return model.half() + + def _get_tokenizer(self, location: str): + if utils.koboldai_vars.model_type == "llama": + tokenizer = LlamaTokenizer.from_pretrained(utils.koboldai_vars.custmodpth) + else: + tokenizer = AutoTokenizer.from_pretrained(utils.koboldai_vars.custmodpth) + + return tokenizer From ded5542d3a78be4d9c0e79486cd387f285acce42 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 16 Apr 2023 21:11:35 +0200 Subject: [PATCH 046/113] Fix error in 4bit offloading initialization code when running with --nobreakmodel --- aiserver.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index a7583d2c..913bea5c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3171,7 +3171,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal lowmem = {} koboldai_vars.lazy_load = False # Also, lazy loader doesn't support GPT-2 models - gpu_layers_list = [int(l) for l in gpu_layers.split(",")] + try: + gpu_layers_list = [int(l) for l in gpu_layers.split(",")] + except ValueError: + gpu_layers_list = [utils.num_layers(model_config)] offload_4bit = use_4_bit and sum(gpu_layers_list) < utils.num_layers(model_config) if offload_4bit: From 1ef515f4c22fc48241f0b825bb47004df17990f9 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 17 Apr 2023 07:21:18 +0200 Subject: [PATCH 047/113] Fix lazy-loading on 4-bit --- modeling/inference_models/hf_torch.py | 19 +++++++++++-------- modeling/inference_models/hf_torch_4bit.py | 2 -- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index a2b2ff80..53b02e6d 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -412,14 +412,17 @@ class HFTorchInferenceModel(HFInferenceModel): @functools.lru_cache(maxsize=None) def get_original_key(key): - return max( - ( - original_key - for original_key in utils.module_names - if original_key.endswith(key) - ), - key=len, - ) + try: + return max( + ( + original_key + for original_key in utils.module_names + if original_key.endswith(key) + ), + key=len, + ) + except ValueError: + return key for key, value in model_dict.items(): original_key = get_original_key(key) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 21f4ebfe..4b02d642 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -104,8 +104,6 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): ) utils.koboldai_vars.model = self.model_name - self.lazy_load = False - self.init_model_config() gpulayers = utils.args.breakmodel_gpulayers From 12699aa22950fe33912c82bf11ac8bc8a3487299 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 17 Apr 2023 07:26:03 +0200 Subject: [PATCH 048/113] Show 4-bit toggle without experimental ui --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 0a98d16f..21290f37 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1602,7 +1602,7 @@ def get_model_info(model, directory=""): 'break_values': break_values, 'gpu_count': gpu_count, 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select, 'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False, - 'bit_4_available': koboldai_vars.bit_4_available if 
koboldai_vars.experimental_features else False, + 'bit_4_available': koboldai_vars.bit_4_available, 'show_custom_model_box': show_custom_model_box}) if send_horde_models: get_cluster_models({'key': key_value, 'url': default_url}) From 10c99a853c207c34d163914042b903d606dad8ee Mon Sep 17 00:00:00 2001 From: nerodiafasciata Date: Wed, 12 Apr 2023 21:37:44 -0500 Subject: [PATCH 049/113] Added AMD instructions, added formatting Added AMD install instructions Formatted the install/run section for improved readability --- README.md | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0657fa0b..170c4f42 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ ### Install/Use Guide (This guide is for both Linux and Windows and assumes user has git installed and a basic grasp of command line use) +#### Installation In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. `git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules` @@ -11,10 +12,28 @@ In the command prompt/command line navigate to where you want the KoboldAI subfo Next step, (Windows) subfolder mode or B: option doesn't matter choose either -[if on Windows] `install_requirements.bat` if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. +* [if on Windows] + ``` + install_requirements.bat + ``` + * if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. -[if on Linux] `install_requirements.sh` +* [if on Linux with Nvidia] + ``` + ./install_requirements.sh + ``` +* [if on Linux with AMD] + ``` + ./install_requirements.sh rocm + ./commandline-rocm.sh + cd repos/gptq + python setup_cuda.py install + ``` + * If you get error missing hip/hip_runtime_xxx.h you dont have proper rocm & hip pkg installed + * If you get CUDA_HOME envar is not set run in env: + `pip3 install torch --index-url https://download.pytorch.org/whl/rocm5.4.2 --force-reinstall` +#### Setting up models If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). @@ -23,9 +42,10 @@ Then move your model folder to KoboldAI/models, and rename the .pt or .safetenso So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). +#### Running KoboldAI and loading 4bit models If you haven't done so already, exit the command prompt/leave KAI's conda env. (Close the commandline window on Windows, run `exit` on Linux) -Run `play.bat` [windows] or `play.sh` [linux] +Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD] Switch to UI2, enable Experimental UI under the Interface tab, then load your model and be sure 4-bit toggle is on. 
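The model-folder layout described in the install guide above is exactly what the loader code in this patch series globs for (`4bit*.safetensors` / `4bit*.pt` next to the usual Hugging Face files). As a rough illustration only — not part of any patch — a folder can be sanity-checked before launch; the file list below is an assumed minimum, not an exhaustive requirement:

```python
# Hypothetical helper, not shipped in any of these patches: check that a model
# folder matches the 4-bit layout the guide above describes.
import glob
import os
import sys

# Assumed minimum set of Hugging Face files; real models may need more.
REQUIRED_FILES = ["config.json", "tokenizer.model"]

def check_4bit_folder(modelpath: str) -> bool:
    # Look for a quantized checkpoint named like 4bit.pt or 4bit-128g.safetensors.
    quantized = [
        f
        for pattern in ("4bit*.safetensors", "4bit*.pt")
        for f in glob.glob(os.path.join(modelpath, pattern))
    ]
    if not quantized:
        print("No 4bit*.pt or 4bit*.safetensors checkpoint found.")
        return False
    missing = [f for f in REQUIRED_FILES if not os.path.isfile(os.path.join(modelpath, f))]
    if missing:
        print("Missing Hugging Face files:", ", ".join(missing))
        return False
    print("Found quantized checkpoint:", quantized[0])
    return True

if __name__ == "__main__":
    folder = sys.argv[1] if len(sys.argv) > 1 else "."
    sys.exit(0 if check_4bit_folder(folder) else 1)
```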
From 934571857ba986202c895d93efe95a58fbcc6308 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 18 Apr 2023 22:52:54 +0200 Subject: [PATCH 050/113] Fix offloading --- modeling/inference_models/hf_torch_4bit.py | 37 ++++++---------------- repos/gptq | 2 +- 2 files changed, 11 insertions(+), 28 deletions(-) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 4b02d642..be504d4f 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -109,10 +109,10 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): gpulayers = utils.args.breakmodel_gpulayers try: - gpu_layers_list = [int(l) for l in gpulayers.split(",")] + self.gpu_layers_list = [int(l) for l in gpulayers.split(",")] except ValueError: - gpu_layers_list = [utils.num_layers(self.model_config)] - self.offload_4bit = sum(gpu_layers_list) < utils.num_layers(self.model_config) + self.gpu_layers_list = [utils.num_layers(self.model_config)] + self.offload_4bit = sum(self.gpu_layers_list) < utils.num_layers(self.model_config) if self.offload_4bit: utils.koboldai_vars.lazy_load = False @@ -321,25 +321,8 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): self.patch_embedding() - if utils.koboldai_vars.hascuda: - if utils.koboldai_vars.usegpu: - # Use just VRAM - self.model = self.model.half().to(utils.koboldai_vars.gpu_device) - elif utils.koboldai_vars.breakmodel: - # Use both RAM and VRAM (breakmodel) - if not self.lazy_load: - self.breakmodel_device_config(self.model.config) - self._move_to_devices() - elif breakmodel.disk_blocks > 0: - # Use disk - self._move_to_devices() - else: - # Use CPU - self.model = self.model.to("cpu").float() - elif breakmodel.disk_blocks > 0: - self._move_to_devices() - else: - self.model = self.model.to("cpu").float() + if not self.offload_4bit: + self.model = self.model.half().to(utils.koboldai_vars.gpu_device) self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() @@ -351,28 +334,28 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit") if utils.koboldai_vars.model_type == "gptj": if self.offload_4bit: - model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) else: model = gptj_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) elif utils.koboldai_vars.model_type == "gpt_neox": if self.offload_4bit: - model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) else: model = gptneox_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) elif utils.koboldai_vars.model_type == "llama": if self.offload_4bit: - model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) else: model = llama_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) elif utils.koboldai_vars.model_type == "opt": if self.offload_4bit: - model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, 
gpu_layers_list) + model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) else: model = opt_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) else: raise RuntimeError(f"4-bit load failed. Model type {utils.koboldai_vars.model_type} not supported in 4-bit") - return model.half() + return model.half() if not self.offload_4bit else model def _get_tokenizer(self, location: str): if utils.koboldai_vars.model_type == "llama": diff --git a/repos/gptq b/repos/gptq index 50b22e2b..5d94e5fb 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 50b22e2ba8ec0f5cf0dca719392a2ec5254e7228 +Subproject commit 5d94e5fb2a03a432d9cbb0db95493ac33b0bfd71 From 8d61d6b04ab7c100db4871fb33c7b7eec835ccc4 Mon Sep 17 00:00:00 2001 From: nerodiafasciata Date: Tue, 25 Apr 2023 00:25:28 -0500 Subject: [PATCH 051/113] install instruction update: don't run as admin (#12) * Update README.md Added note to tell windows users not to install as admin --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 170c4f42..67fe881a 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ #### Installation In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. +Note: do not run your command prompt as administrator/with elevated priviledges, reports suggest this leads to problems. + `git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules` `cd KoboldAI` From b58e5f353febf4c20f5ae2194b369f7e9160420a Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 25 Apr 2023 18:56:25 +0200 Subject: [PATCH 052/113] Add wheel links file for pip --- docs/gptq-whl-links.html | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 docs/gptq-whl-links.html diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html new file mode 100644 index 00000000..c612b5e1 --- /dev/null +++ b/docs/gptq-whl-links.html @@ -0,0 +1,2 @@ +quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl +quant_cuda-0.0.0-cp38-cp38-win_amd64.whl From cd289a947824ad52daa9192363115b5322dbf749 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 25 Apr 2023 19:06:25 +0200 Subject: [PATCH 053/113] Use custom pip repo for wheels instead of modifying install_requirements scripts --- environments/huggingface.yml | 2 ++ install_requirements.bat | 4 ---- install_requirements.sh | 3 --- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index daa25e1f..35580603 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -45,3 +45,5 @@ dependencies: - ftfy - pydub - diffusers + - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html + - quant_cuda diff --git a/install_requirements.bat b/install_requirements.bat index 3b735ddf..2a4534c1 100644 --- a/install_requirements.bat +++ b/install_requirements.bat @@ -48,8 +48,6 @@ umamba.exe create -r B:\python\ -n base umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy umamba.exe -r B:\ clean -a -y rd B:\Python\pkgs /S /Q -call B:\python\condabin\activate -pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" subst B: /d pause exit @@ -62,7 +60,5 @@ umamba.exe create -r miniconda3\ -n base umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingface.yml -y --always-copy umamba.exe clean -a -y rd 
miniconda3\Python\pkgs /S /Q -call miniconda3\condabin\activate -pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit diff --git a/install_requirements.sh b/install_requirements.sh index 7b5a8d5b..6f0e0dfd 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -5,9 +5,6 @@ wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar - bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y - -# Install quant_cuda module for 4-bit -bin/micromamba run -r runtime -n koboldai pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl exit fi if [[ $1 = "rocm" ]]; then From 99c4c3bae4956e7190beb6909a42d7debd033553 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 17 Apr 2023 07:26:03 +0200 Subject: [PATCH 054/113] Show 4-bit toggle without experimental ui --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 85523734..2fc8990c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1809,7 +1809,7 @@ def get_model_info(model, directory=""): 'break_values': break_values, 'gpu_count': gpu_count, 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select, 'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False, - 'bit_4_available': koboldai_vars.bit_4_available if koboldai_vars.experimental_features else False, + 'bit_4_available': koboldai_vars.bit_4_available, 'show_custom_model_box': show_custom_model_box}) if send_horde_models: get_cluster_models({'key': key_value, 'url': default_url}) From aedb6388c5f22b3bd99a0b8e17dc45d14c50e142 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 27 Apr 2023 07:05:11 +0200 Subject: [PATCH 055/113] Update README, remove experimental UI --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 67fe881a..aadfd345 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ If you haven't done so already, exit the command prompt/leave KAI's conda env. ( Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD] -Switch to UI2, enable Experimental UI under the Interface tab, then load your model and be sure 4-bit toggle is on. +Switch to UI2, then load your model and be sure 4-bit toggle is on. The 4bit toggle shows when a model to load is selected. 
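With the manual `pip install` lines removed from the install scripts, the prebuilt `quant_cuda` wheel is pulled in through the `--find-links` entry in `environments/huggingface.yml`. A quick way to confirm the wheel actually installed is sketched below; this is an illustration rather than part of the patches, and it assumes nothing about the extension beyond it being importable:

```python
# Illustrative post-install check (not part of the patch series): confirm the
# quant_cuda extension from the gptq-whl-links index is importable and that
# PyTorch can see a CUDA device.
import importlib

def check_quant_backend() -> None:
    try:
        quant_cuda = importlib.import_module("quant_cuda")
        print("quant_cuda loaded from", quant_cuda.__file__)
    except ImportError as err:
        print("quant_cuda wheel missing or broken:", err)
        return

    try:
        import torch
    except ImportError:
        print("PyTorch is not installed in this environment.")
        return
    print("CUDA available:", torch.cuda.is_available())

if __name__ == "__main__":
    check_quant_backend()
```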
From 18ac5dfce6398a561c4521356f7187e6977a7c61 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 27 Apr 2023 16:04:30 +0200 Subject: [PATCH 056/113] Update to Pytorch 1.13.1 and CUDA 11.7 --- docs/gptq-whl-links.html | 4 ++-- environments/huggingface.yml | 6 ++++-- repos/gptq | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index c612b5e1..710a43b8 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -1,2 +1,2 @@ -quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl -quant_cuda-0.0.0-cp38-cp38-win_amd64.whl +quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl +quant_cuda-0.0.0-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 35580603..b1b86c45 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -1,6 +1,7 @@ name: koboldai channels: - pytorch + - nvidia - conda-forge - defaults dependencies: @@ -8,9 +9,10 @@ dependencies: - flask-socketio=5.3.2 - flask-session=0.4.0 - python-socketio=5.7.2 - - pytorch=1.11.* + - pytorch=1.13.1 + - pytorch-cuda=11.7 - python=3.8.* - - cudatoolkit=11.1 + - cudatoolkit=11.7 - eventlet=0.33.3 - dnspython=2.2.1 - markdown diff --git a/repos/gptq b/repos/gptq index 50b22e2b..3c16fd9c 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 50b22e2ba8ec0f5cf0dca719392a2ec5254e7228 +Subproject commit 3c16fd9c7946ebe85df8d951cb742adbc1966ec7 From 81f92ec402e07a07516a39859c98a616ecb47084 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 28 Apr 2023 10:55:22 +0200 Subject: [PATCH 057/113] Fix missing 4bit setting --- koboldai_settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/koboldai_settings.py b/koboldai_settings.py index 1a4fcce6..7b7acac1 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1296,6 +1296,8 @@ class system_settings(settings): self.keep_img_gen_in_memory = False self.cookies = {} #cookies for colab since colab's URL changes, cookies are lost self.experimental_features = False + # Check if repos/gptq exists for 4-bit mode + self.bit_4_available = os.path.isdir("repos/gptq") self.seen_messages = [] self.git_repository = "" self.git_branch = "" From 852005fef484846c648cb6bdd9b0e2091e75e486 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 28 Apr 2023 18:32:34 +0200 Subject: [PATCH 058/113] Always use GPU offloader if splitting across GPUs, this increases speed considerably --- aiserver.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/aiserver.py b/aiserver.py index 5c2b7533..578a2cff 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3186,9 +3186,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal gpu_layers_list = [int(l) for l in gpu_layers.split(",")] except ValueError: gpu_layers_list = [utils.num_layers(model_config)] - offload_4bit = use_4_bit and sum(gpu_layers_list) < utils.num_layers(model_config) - if offload_4bit: + if use_4_bit: koboldai_vars.lazy_load = False print("4-bit CPU offloader active") @@ -3223,28 +3222,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") if koboldai_vars.model_type == "gptj": - if offload_4bit: - model = load_quant_offload(gptj_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) - else: - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(gptj_load_quant, 
koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - if offload_4bit: - model = load_quant_offload(gptneox_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) - else: - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(gptneox_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - if offload_4bit: - model = load_quant_offload(llama_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) - else: - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(llama_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": - if offload_4bit: - model = load_quant_offload(opt_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) - else: - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(opt_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") @@ -3352,7 +3339,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal patch_causallm(model) if(koboldai_vars.hascuda): - if offload_4bit: + if use_4_bit: koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate elif(koboldai_vars.usegpu): From 20a5587d660f651f108762ec99faf357a678285d Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 30 Apr 2023 18:17:43 +0200 Subject: [PATCH 059/113] Always use offloader script, because it speeds up multi gpu --- modeling/inference_models/hf_torch_4bit.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index be504d4f..98c9d785 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -333,25 +333,13 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit") if utils.koboldai_vars.model_type == "gptj": - if self.offload_4bit: - model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) - else: - model = gptj_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "gpt_neox": - if self.offload_4bit: - model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) - else: - model = gptneox_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "llama": - if self.offload_4bit: - model = load_quant_offload(llama_load_quant, 
utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) - else: - model = llama_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "opt": - if self.offload_4bit: - model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) - else: - model = opt_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize) + model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) else: raise RuntimeError(f"4-bit load failed. Model type {utils.koboldai_vars.model_type} not supported in 4-bit") From aa67135d4280279fc50bc8223b582ec2fae38e11 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 30 Apr 2023 21:59:22 +0200 Subject: [PATCH 060/113] Implement new model format Remove 4bit toggle --- aiserver.py | 24 ++++++++-- koboldai_settings.py | 8 +++- modeling/inference_models/hf_torch_4bit.py | 55 +--------------------- static/koboldai.js | 21 +-------- 4 files changed, 31 insertions(+), 77 deletions(-) diff --git a/aiserver.py b/aiserver.py index ce5e3558..6b81eaf0 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1088,6 +1088,24 @@ def loadmodelsettings(): if(not koboldai_vars.gamestarted): koboldai_vars.authornotetemplate = koboldai_vars.setauthornotetemplate + gptq_legacy_files = glob.glob(os.path.join(koboldai_vars.custmodpth, "4bit*.pt")) + glob.glob(os.path.join(koboldai_vars.custmodpth, "4bit*.safetensors")) + if "gptq_bits" in js: + koboldai_vars.gptq_model = True + koboldai_vars.gptq_bits = js["gptq_bits"] + koboldai_vars.gptq_groupsize = js.get("gptq_groupsize", -1) + safetensors_file = os.path.join(koboldai_vars.custmodpth, "model.safetensors") + pt_file = os.path.join(koboldai_vars.custmodpth, "model.ckpt") + koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file + elif gptq_legacy_files: + koboldai_vars.gptq_model = True + koboldai_vars.gptq_bits = 4 + koboldai_vars.gptq_file = gptq_legacy_files[0] + fname = Path(koboldai_vars.gptq_file).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + koboldai_vars.gptq_groupsize = int(g[0]) if g else -1 + else: + koboldai_vars.gptq_model = False + #==================================================================# # Take settings from koboldai_vars and write them to client settings file #==================================================================# @@ -1777,7 +1795,7 @@ def unload_model(): koboldai_vars.badwordsids = koboldai_settings.badwordsids_default -def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False): +def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False): global model global tokenizer global model_config @@ -1957,7 +1975,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except: pass - if use_4_bit: + if koboldai_vars.gptq_model: from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel model = HFTorch4BitInferenceModel( koboldai_vars.model, @@ -6495,7 +6513,7 @@ def UI_2_load_model(data): koboldai_vars.model = 
data['model'] koboldai_vars.custmodpth = data['path'] print("loading Model") - load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'], use_4_bit=data['use_4_bit']) + load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit']) #==================================================================# # Event triggered when load story is clicked diff --git a/koboldai_settings.py b/koboldai_settings.py index 56697573..c6560e32 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -919,7 +919,13 @@ class story_settings(settings): # In percent!!! self.commentary_chance = 0 self.commentary_enabled = False - + + # 4bit model vals + self.gptq_model = False + self.gptq_bits = -1 + self.gptq_groupsize = -1 + self.gptq_file = None + self.save_paths = SavePaths(os.path.join("stories", self.story_name or "Untitled")) ################### must be at bottom ######################### diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 98c9d785..a0e89436 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -27,64 +27,12 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel # 4-bit dependencies from pathlib import Path -import glob sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant from opt import load_quant as opt_load_quant from offload import load_quant_offload -monkey_patched_4bit = False - - -def prepare_4bit_load(modelpath): - paths_4bit = ["4bit*.safetensors", "4bit*.pt"] - paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] - result = False - groupsize = -1 - for p in paths_4bit: - p = os.path.join(modelpath, p) - val = [v for v in glob.glob(p) if "4bit-old" not in v] - if val: - result = val[0] - fname = Path(result).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname) - if g: - groupsize = int(g[0]) - break - - global monkey_patched_4bit - - # Monkey-patch in old-format pt-file support - if not result: - print("4-bit file not found, falling back to old format.") - for p in paths_4bit_old: - p = os.path.join(modelpath, p) - if os.path.isfile(p): - result = p - break - - if not result: - print("4-bit old-format file not found, loading failed.") - raise RuntimeError("4-bit load failed. 
PT/Safetensors-File not found.") - - import llama, opt, gptneox, gptj, old_quant - llama.make_quant = old_quant.old_make_quant - opt.make_quant = old_quant.old_make_quant - gptneox.make_quant = old_quant.old_make_quant - gptj.make_quant = old_quant.old_make_quant - monkey_patched_4bit = True - elif monkey_patched_4bit: - # Undo monkey patch - print("Undoing 4-bit old format monkey patch") - import llama, opt, gptneox, gptj, quant - llama.make_quant = quant.make_quant - opt.make_quant = quant.make_quant - gptneox.make_quant = quant.make_quant - gptj.make_quant = quant.make_quant - monkey_patched_4bit = False - - return result, groupsize class HFTorch4BitInferenceModel(HFTorchInferenceModel): @@ -328,7 +276,8 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): utils.koboldai_vars.modeldim = self.get_hidden_size() def _get_model(self, location: str, tf_kwargs: Dict): - path_4bit, groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth) + path_4bit = utils.koboldai_vars.gptq_file + groupsize = utils.koboldai_vars.gptq_groupsize print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit") diff --git a/static/koboldai.js b/static/koboldai.js index 89ee2ea1..cc31899f 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -1472,7 +1472,6 @@ function show_model_menu(data) { document.getElementById("modelurl").classList.add("hidden"); document.getElementById("use_gpu_div").classList.add("hidden"); document.getElementById("use_8_bit_div").classList.add("hidden"); - document.getElementById("use_4_bit_div").classList.add("hidden"); document.getElementById("modellayers").classList.add("hidden"); document.getElementById("oaimodel").classList.add("hidden"); var model_layer_bars = document.getElementById('model_layer_bars'); @@ -1646,14 +1645,6 @@ function selected_model_info(data) { document.getElementById("use_8_bit_div").classList.add("hidden"); document.getElementById("use_8_bit").checked = false; } - - //hide or unhide 4 bit mode - if (data.bit_4_available) { - document.getElementById("use_4_bit_div").classList.remove("hidden"); - } else { - document.getElementById("use_4_bit_div").classList.add("hidden"); - document.getElementById("use_4_bit").checked = false; - } //default URL loading if (data.default_url != null) { @@ -1823,8 +1814,6 @@ function selected_model_info(data) { accept.classList.remove("disabled"); } accept.disabled = false; - - set_4_bit_mode(invert=false); } function update_gpu_layers() { @@ -1885,8 +1874,7 @@ function load_model() { 'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(), 'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value, 'online_model': selected_models, - 'use_8_bit': document.getElementById('use_8_bit').checked, - 'use_4_bit': document.getElementById('use_4_bit').checked}; + 'use_8_bit': document.getElementById('use_8_bit').checked}; socket.emit("load_model", message); closePopups(); } @@ -3170,13 +3158,6 @@ function save_preset() { closePopups(); } -function set_4_bit_mode(invert=true) { - bit_4_status = document.getElementById("use_4_bit").checked; - if (invert) { - bit_4_status = !bit_4_status; - } -} - //--------------------------------------------General UI Functions------------------------------------ From 9c3d578d6c3449f951e97be06b67bc7b84eff0ba Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 2 May 2023 21:32:20 +0200 Subject: [PATCH 061/113] Work on model download support --- aiserver.py | 32 +++---- 
modeling/inference_models/generic_hf_torch.py | 5 +- modeling/inference_models/hf.py | 8 ++ modeling/inference_models/hf_torch_4bit.py | 86 ++++++++++++++++--- 4 files changed, 98 insertions(+), 33 deletions(-) diff --git a/aiserver.py b/aiserver.py index 48e70854..81bb900f 100644 --- a/aiserver.py +++ b/aiserver.py @@ -50,6 +50,8 @@ import multiprocessing import numpy as np from collections import OrderedDict from typing import Any, Callable, TypeVar, Tuple, Union, Dict, Set, List, Optional, Type +import glob +from pathlib import Path import requests import html @@ -86,18 +88,6 @@ allowed_ips = set() # empty set enable_whitelist = False -# 4-bit dependencies -from pathlib import Path -import glob -sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) -from gptj import load_quant as gptj_load_quant -from gptneox import load_quant as gptneox_load_quant -from llama import load_quant as llama_load_quant -from opt import load_quant as opt_load_quant -from offload import load_quant_offload -monkey_patched_4bit = False - - if lupa.LUA_VERSION[:2] != (5, 4): logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.") @@ -1974,6 +1964,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except: pass + if not koboldai_vars.gptq_model: + # Run generic HF model load_config first to check what model it is + from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel + model = GenericHFTorchInferenceModel( + koboldai_vars.model, + lazy_load=koboldai_vars.lazy_load, + low_mem=args.lowmem + ) + model.load_config() + if koboldai_vars.gptq_model: from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel model = HFTorch4BitInferenceModel( @@ -1981,14 +1981,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal lazy_load=koboldai_vars.lazy_load, low_mem=args.lowmem ) - else: - from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel - model = GenericHFTorchInferenceModel( - koboldai_vars.model, - lazy_load=koboldai_vars.lazy_load, - low_mem=args.lowmem - ) - model.load( save_model=not (args.colab or args.cacheonly) or args.savemodel, initial_load=initial_load, diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py index ce91b176..d45513aa 100644 --- a/modeling/inference_models/generic_hf_torch.py +++ b/modeling/inference_models/generic_hf_torch.py @@ -24,7 +24,7 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel class GenericHFTorchInferenceModel(HFTorchInferenceModel): - def _load(self, save_model: bool, initial_load: bool) -> None: + def load_config(self) -> None: utils.koboldai_vars.allowsp = True # Make model path the same as the model name to make this consistent @@ -50,6 +50,9 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): self.init_model_config() + def _load(self, save_model: bool, initial_load: bool) -> None: + self.load_config() + tf_kwargs = { "low_cpu_mem_usage": True, } diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index eae4bb2d..480da5d3 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -61,6 +61,14 @@ class HFInferenceModel(InferenceModel): cache_dir="cache", ) utils.koboldai_vars.model_type = self.model_config.model_type + + if "gptq_bits" in dir(self.model_config): + utils.koboldai_vars.gptq_model = True + utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits + 
utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize + utils.koboldai_vars.gptq_file = None + else: + utils.koboldai_vars.gptq_model = False except ValueError: utils.koboldai_vars.model_type = { "NeoCustom": "gpt_neo", diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index a0e89436..f0ff87b9 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import glob import json import torch import re @@ -9,7 +10,6 @@ import sys from typing import Union from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer -from modeling.inference_model import SuperLegacyModelError import utils import modeling.lazy_loader as lazy_loader @@ -33,6 +33,66 @@ from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant from opt import load_quant as opt_load_quant from offload import load_quant_offload +monkey_patched_4bit = False + + +def prepare_4bit_load(modelpath): + path_4bit = os.path.join(modelpath, "model.safetensors") + if os.path.isfile(path_4bit): + return path_4bit, False + + path_4bit = os.path.join(modelpath, "model.ckpt") + if os.path.isfile(path_4bit): + return path_4bit, False + + # Legacy format support + paths_4bit = ["4bit*.safetensors", "4bit*.pt"] + paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] + result = False + groupsize = -1 + for p in paths_4bit: + p = os.path.join(modelpath, p) + val = [v for v in glob.glob(p) if "4bit-old" not in v] + if val: + result = val[0] + fname = Path(result).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + if g: + groupsize = int(g[0]) + break + + global monkey_patched_4bit + + # Monkey-patch in old-format pt-file support + if not result: + print("4-bit file not found, falling back to old format.") + for p in paths_4bit_old: + p = os.path.join(modelpath, p) + if os.path.isfile(p): + result = p + break + + if not result: + print("4-bit old-format file not found, loading failed.") + raise RuntimeError("4-bit load failed. 
PT/Safetensors-File not found.") + + import llama, opt, gptneox, gptj, old_quant + llama.make_quant = old_quant.old_make_quant + opt.make_quant = old_quant.old_make_quant + gptneox.make_quant = old_quant.old_make_quant + gptj.make_quant = old_quant.old_make_quant + monkey_patched_4bit = True + elif monkey_patched_4bit: + # Undo monkey patch + print("Undoing 4-bit old format monkey patch") + import llama, opt, gptneox, gptj, quant + llama.make_quant = quant.make_quant + opt.make_quant = quant.make_quant + gptneox.make_quant = quant.make_quant + gptj.make_quant = quant.make_quant + monkey_patched_4bit = False + + return result, groupsize class HFTorch4BitInferenceModel(HFTorchInferenceModel): @@ -87,17 +147,12 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): ): try: metamodel = AutoModelForCausalLM.from_config(self.model_config) + utils.layers_module_names = utils.get_layers_module_names(metamodel) + utils.module_names = list(metamodel.state_dict().keys()) + utils.named_buffers = list(metamodel.named_buffers(recurse=True)) except Exception as e: - logger.error(f"Fell back to neo for metamodel due to {e}") - try: - metamodel = GPTNeoForCausalLM.from_config(self.model_config) - except Exception as e: - logger.error(f"Falling back again due to {e}") - raise SuperLegacyModelError - - utils.layers_module_names = utils.get_layers_module_names(metamodel) - utils.module_names = list(metamodel.state_dict().keys()) - utils.named_buffers = list(metamodel.named_buffers(recurse=True)) + logger.warning(f"Gave up on lazy loading due to {e}") + self.lazy_load = False # Download model from Huggingface if it does not exist, otherwise load locally with self._maybe_use_float16(), lazy_loader.use_lazy_load( @@ -276,8 +331,15 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): utils.koboldai_vars.modeldim = self.get_hidden_size() def _get_model(self, location: str, tf_kwargs: Dict): - path_4bit = utils.koboldai_vars.gptq_file + if not utils.koboldai_vars.custmodpth: + pass groupsize = utils.koboldai_vars.gptq_groupsize + + path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth) + + if legacy_groupsize is not False: + groupsize = legacy_groupsize + print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit") From dd6644aaf06813ceada9c0d7f669f1dfbcb38a09 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 2 May 2023 22:11:28 +0200 Subject: [PATCH 062/113] Pytorch 2.0 (#18) * Update huggingface.yml to Pytorch 2.0 and CUDA 11.8 * Update github docs pip wheel hub Update ROCm requirements * Add rocm wheel --- docs/gptq-whl-links.html | 5 +++-- environments/huggingface.yml | 7 +++---- environments/rocm.yml | 8 +++++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 710a43b8..fed8b397 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -1,2 +1,3 @@ -quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl -quant_cuda-0.0.0-cp38-cp38-win_amd64.whl +quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl +quant_rocm-0.0.0-cp38-cp38-linux_x86_64.whl +quant_cuda-0.0.0-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 8d5907ab..e5fb939c 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -10,10 +10,9 @@ dependencies: - flask-socketio=5.3.2 - flask-session=0.4.0 - python-socketio=5.7.2 - - pytorch=1.13.1 - - pytorch-cuda=11.7 + - pytorch=2.0.0 + - pytorch-cuda=11.8 - 
python=3.8.* - - cudatoolkit=11.7 - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -34,7 +33,7 @@ dependencies: - flask-cors - lupa==1.10 - transformers==4.28.0 - - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc + - datasets - huggingface_hub==0.12.1 - safetensors - accelerate==0.18.0 diff --git a/environments/rocm.yml b/environments/rocm.yml index 91b63dbd..9358575d 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -24,14 +24,14 @@ dependencies: - Pillow - psutil - pip: - - --extra-index-url https://download.pytorch.org/whl/rocm5.2 - - torch==1.13.1+rocm5.2 + - --extra-index-url https://download.pytorch.org/whl/rocm5.4.2 + - torch==2.0.0+rocm5.4.2 - flask-cloudflared==0.0.10 - flask-ngrok - flask-cors - lupa==1.10 - transformers==4.28.0 - - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc + - datasets - huggingface_hub==0.12.1 - safetensors - accelerate==0.18.0 @@ -42,3 +42,5 @@ dependencies: - ftfy - pydub - diffusers + - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html + - quant_rocm From d48fedcbfb7ba8dca9623215822ab1cbb700612e Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 4 May 2023 18:31:37 +0200 Subject: [PATCH 063/113] Fix llama 4-bit loading error --- modeling/inference_models/hf_torch_4bit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index f0ff87b9..10ef0e56 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -14,7 +14,7 @@ from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, import utils import modeling.lazy_loader as lazy_loader import koboldai_settings -from logger import logger, set_logger_verbosity, quiesce_logger +from logger import logger, set_logger_verbosity try: import breakmodel @@ -24,6 +24,7 @@ except ModuleNotFoundError as e: raise e from modeling.inference_models.hf_torch import HFTorchInferenceModel +from modeling.tokenizer import GenericTokenizer # 4-bit dependencies from pathlib import Path @@ -362,4 +363,4 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): else: tokenizer = AutoTokenizer.from_pretrained(utils.koboldai_vars.custmodpth) - return tokenizer + return GenericTokenizer(tokenizer) From 4180620999307a8eefb2bcd05e94161eb478243b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 4 May 2023 19:52:56 +0200 Subject: [PATCH 064/113] Remove unnecessary changes, move gptq detection function to 4bit.py --- aiserver.py | 32 +--------------------- modeling/inference_models/hf_torch_4bit.py | 32 ++++++++++++++++++++++ static/koboldai.js | 8 +++--- templates/popups.html | 4 --- 4 files changed, 37 insertions(+), 39 deletions(-) diff --git a/aiserver.py b/aiserver.py index e4c0c521..80518450 100644 --- a/aiserver.py +++ b/aiserver.py @@ -601,6 +601,7 @@ utils.socketio = socketio # Weird import position to steal koboldai_vars from utils from modeling.patches import patch_transformers +from modeling.inference_models.hf_torch_4bit import load_model_gptq_settings old_socketio_on = socketio.on @@ -1078,37 +1079,6 @@ def loadmodelsettings(): koboldai_vars.authornotetemplate = koboldai_vars.setauthornotetemplate -def load_model_gptq_settings(): - try: - js = json.loads(str(model.model_config).partition(' ')[2]) - except Exception as e: - try: - try: - js = json.load(open(koboldai_vars.custmodpth + "/config.json", "r")) - except Exception as e: - js = json.load(open(koboldai_vars.custmodpth.replace('/', 
'_') + "/config.json", "r")) - except Exception as e: - js = {} - - gptq_legacy_files = glob.glob(os.path.join(koboldai_vars.custmodpth, "4bit*.pt")) + glob.glob(os.path.join(koboldai_vars.custmodpth, "4bit*.safetensors")) - if "gptq_bits" in js: - koboldai_vars.gptq_model = True - koboldai_vars.gptq_bits = js["gptq_bits"] - koboldai_vars.gptq_groupsize = js.get("gptq_groupsize", -1) - safetensors_file = os.path.join(koboldai_vars.custmodpth, "model.safetensors") - pt_file = os.path.join(koboldai_vars.custmodpth, "model.ckpt") - koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file - elif gptq_legacy_files: - koboldai_vars.gptq_model = True - koboldai_vars.gptq_bits = 4 - koboldai_vars.gptq_file = gptq_legacy_files[0] - fname = Path(koboldai_vars.gptq_file).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) - koboldai_vars.gptq_groupsize = int(g[0]) if g else -1 - else: - koboldai_vars.gptq_model = False - - #==================================================================# # Take settings from koboldai_vars and write them to client settings file #==================================================================# diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 10ef0e56..5eb8d60c 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -96,6 +96,38 @@ def prepare_4bit_load(modelpath): return result, groupsize +def load_model_gptq_settings(): + try: + js = json.loads(str(model.model_config).partition(' ')[2]) + except Exception as e: + try: + try: + js = json.load(open(utils.koboldai_vars.custmodpth + "/config.json", "r")) + except Exception as e: + js = json.load(open(utils.koboldai_vars.custmodpth.replace('/', '_') + "/config.json", "r")) + except Exception as e: + utils.koboldai_vars.gptq_model = False + return + + gptq_legacy_files = glob.glob(os.path.join(utils.koboldai_vars.custmodpth, "4bit*.pt")) + glob.glob(os.path.join(utils.koboldai_vars.custmodpth, "4bit*.safetensors")) + if "gptq_bits" in js: + utils.koboldai_vars.gptq_model = True + utils.koboldai_vars.gptq_bits = js["gptq_bits"] + utils.koboldai_vars.gptq_groupsize = js.get("gptq_groupsize", -1) + safetensors_file = os.path.join(utils.koboldai_vars.custmodpth, "model.safetensors") + pt_file = os.path.join(utils.koboldai_vars.custmodpth, "model.ckpt") + utils.koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file + elif gptq_legacy_files: + utils.koboldai_vars.gptq_model = True + utils.koboldai_vars.gptq_bits = 4 + utils.koboldai_vars.gptq_file = gptq_legacy_files[0] + fname = Path(utils.koboldai_vars.gptq_file).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + utils.koboldai_vars.gptq_groupsize = int(g[0]) if g else -1 + else: + utils.koboldai_vars.gptq_model = False + + class HFTorch4BitInferenceModel(HFTorchInferenceModel): def _load(self, save_model: bool, initial_load: bool) -> None: utils.koboldai_vars.allowsp = True diff --git a/static/koboldai.js b/static/koboldai.js index 7918c3ff..cfc32d21 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -1680,7 +1680,7 @@ function selected_model_info(data) { document.getElementById("use_8_bit_div").classList.add("hidden"); document.getElementById("use_8_bit").checked = false; } - + //default URL loading if (data.default_url != null) { document.getElementById("modelurl").value = data.default_url; @@ -1849,6 +1849,8 @@ function selected_model_info(data) { 
accept.classList.remove("disabled"); } accept.disabled = false; + + } function update_gpu_layers() { @@ -3231,8 +3233,6 @@ function save_preset() { closePopups(); } - - //--------------------------------------------General UI Functions------------------------------------ function put_cursor_at_element(element) { var range = document.createRange(); @@ -7388,4 +7388,4 @@ $el("#gamescreen").addEventListener("paste", function(event) { false, event.clipboardData.getData("text/plain") ); -}); +}); \ No newline at end of file diff --git a/templates/popups.html b/templates/popups.html index e53b6276..d3310e66 100644 --- a/templates/popups.html +++ b/templates/popups.html @@ -75,10 +75,6 @@
Use 8 bit mode
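(Illustrative aside, not part of the patch above.) Patch 064 moves the GPTQ settings detection out of aiserver.py; its legacy branch infers the group size from file names such as `4bit-128g.safetensors` using the regex it carries over. A minimal, self-contained sketch of that parsing — the file names below are made up for illustration only:

```
import re
from pathlib import Path

# Same pattern the relocated load_model_gptq_settings() applies to legacy files.
GROUPSIZE_RE = r"^(?:4bit)(?:-)(\d+)(?:g-?)"

def legacy_groupsize(fpath: str) -> int:
    """Return the group size encoded in a legacy 4-bit filename, or -1."""
    fname = Path(fpath).parts[-1]
    g = re.findall(GROUPSIZE_RE, fname)
    return int(g[0]) if g else -1

print(legacy_groupsize("models/example-llama/4bit-128g.safetensors"))  # -> 128
print(legacy_groupsize("models/example-llama/4bit.pt"))                # -> -1
```

A name without the `-<N>g` infix falls back to -1, matching the default `gptq_groupsize` used elsewhere in these patches.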
- From 43b0afc7a85d2ae61d478cd258a8015d177660b2 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 5 May 2023 20:07:10 +0200 Subject: [PATCH 065/113] Add safe MPT support --- .gitmodules | 4 ++++ aiserver.py | 2 +- environments/huggingface.yml | 1 + environments/rocm.yml | 1 + modeling/inference_models/generic_hf_torch.py | 3 ++- modeling/inference_models/hf.py | 2 +- modeling/inference_models/hf_torch.py | 2 +- modeling/inference_models/hf_torch_4bit.py | 3 ++- repos/__init__.py | 1 + repos/hf_bleeding_edge | 1 + utils.py | 2 +- 11 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 repos/__init__.py create mode 160000 repos/hf_bleeding_edge diff --git a/.gitmodules b/.gitmodules index c6f4b308..4a1fb7c9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -8,3 +8,7 @@ path = repos/gptq url = https://github.com/0cc4m/GPTQ-for-LLaMa branch = a8303654c200c25577130466e5f9bc1e70fc8a50 +[submodule "repos/hf_bleeding_edge"] + path = repos/hf_bleeding_edge + url = https://github.com/0cc4m/hf_bleeding_edge + branch = b5d0b80c6947605b9ccf080fc17b68a516ea5857 diff --git a/aiserver.py b/aiserver.py index 80518450..bb6cc171 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1621,7 +1621,7 @@ def get_layer_count(model, directory=""): else: if(directory): model = directory - from transformers import AutoConfig + from repos.hf_bleeding_edge import AutoConfig if(os.path.isdir(model.replace('/', '_'))): model_config = AutoConfig.from_pretrained(model.replace('/', '_'), revision=koboldai_vars.revision, cache_dir="cache") elif(is_model_downloaded(model)): diff --git a/environments/huggingface.yml b/environments/huggingface.yml index e5fb939c..a179c468 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -49,3 +49,4 @@ dependencies: - diffusers - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - quant_cuda + - einops diff --git a/environments/rocm.yml b/environments/rocm.yml index 9358575d..d0daf4f2 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -44,3 +44,4 @@ dependencies: - diffusers - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - quant_rocm + - einops diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py index 9e30a7fd..61004db5 100644 --- a/modeling/inference_models/generic_hf_torch.py +++ b/modeling/inference_models/generic_hf_torch.py @@ -6,7 +6,8 @@ import torch import shutil from typing import Union -from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, GPT2LMHeadModel +from transformers import GPTNeoForCausalLM, GPT2LMHeadModel +from repos.hf_bleeding_edge import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 99e55be4..8c797940 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -1,6 +1,6 @@ import os from typing import Optional -from transformers import AutoConfig +from repos.hf_bleeding_edge import AutoConfig import utils import koboldai_settings diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 3cc28291..e0081c90 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -19,9 +19,9 @@ from transformers import ( StoppingCriteria, GPTNeoForCausalLM, GPT2LMHeadModel, - AutoModelForCausalLM, LogitsProcessorList, ) +from repos.hf_bleeding_edge import AutoModelForCausalLM import utils import modeling.lazy_loader as 
lazy_loader diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 5eb8d60c..75fb9ddf 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -9,7 +9,8 @@ import shutil import sys from typing import Union -from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer +from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer +from repos.hf_bleeding_edge import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader diff --git a/repos/__init__.py b/repos/__init__.py new file mode 100644 index 00000000..af438273 --- /dev/null +++ b/repos/__init__.py @@ -0,0 +1 @@ +from . import hf_bleeding_edge diff --git a/repos/hf_bleeding_edge b/repos/hf_bleeding_edge new file mode 160000 index 00000000..b5d0b80c --- /dev/null +++ b/repos/hf_bleeding_edge @@ -0,0 +1 @@ +Subproject commit b5d0b80c6947605b9ccf080fc17b68a516ea5857 diff --git a/utils.py b/utils.py index 13ebb6a3..89b9fb4f 100644 --- a/utils.py +++ b/utils.py @@ -184,7 +184,7 @@ def decodenewlines(txt): # Returns number of layers given an HF model config #==================================================================# def num_layers(config): - return config["n_layer"] if isinstance(config, dict) else config.num_layers if hasattr(config, "num_layers") else config.n_layer if hasattr(config, "n_layer") else config.num_hidden_layers if hasattr(config, 'num_hidden_layers') else None + return config["n_layer"] if isinstance(config, dict) else config.num_layers if hasattr(config, "num_layers") else config.n_layer if hasattr(config, "n_layer") else config.num_hidden_layers if hasattr(config, 'num_hidden_layers') else config.n_layers if hasattr(config, "n_layers") else None #==================================================================# # Downloads huggingface checkpoints using aria2c if possible From dedf2afeb3df922f164892ff3144d6d110f0dc43 Mon Sep 17 00:00:00 2001 From: Henk Date: Fri, 5 May 2023 19:50:56 +0200 Subject: [PATCH 066/113] More max_context_length flexibility --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index bb6cc171..791ae071 100644 --- a/aiserver.py +++ b/aiserver.py @@ -8302,7 +8302,7 @@ class GenerationInputSchema(SamplerSettingsSchema): use_userscripts: bool = fields.Boolean(load_default=False, metadata={"description": "Whether or not to use the userscripts from the KoboldAI GUI when generating text."}) soft_prompt: Optional[str] = fields.String(metadata={"description": "Soft prompt to use when generating. 
If set to the empty string or any other string containing no non-whitespace characters, uses no soft prompt."}, validate=[soft_prompt_validator, validate.Regexp(r"^[^/\\]*$")]) max_length: int = fields.Integer(validate=validate.Range(min=1, max=512), metadata={"description": "Number of tokens to generate."}) - max_context_length: int = fields.Integer(validate=validate.Range(min=512, max=2048), metadata={"description": "Maximum number of tokens to send to the model."}) + max_context_length: int = fields.Integer(validate=validate.Range(min=1), metadata={"description": "Maximum number of tokens to send to the model."}) n: int = fields.Integer(validate=validate.Range(min=1, max=5), metadata={"description": "Number of outputs to generate."}) disable_output_formatting: bool = fields.Boolean(load_default=True, metadata={"description": "When enabled, all output formatting options default to `false` instead of the value in the KoboldAI GUI."}) frmttriminc: Optional[bool] = fields.Boolean(metadata={"description": "Output formatting option. When enabled, removes some characters from the end of the output such that the output doesn't end in the middle of a sentence. If the output is less than one sentence long, does nothing.\n\nIf `disable_output_formatting` is `true`, this defaults to `false` instead of the value in the KoboldAI GUI."}) From 2f7856f0d1e1d153256f884248fd37432ed57279 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 6 May 2023 20:52:42 +0200 Subject: [PATCH 067/113] Use GPTQ python module, add MPT quantized support --- aiserver.py | 2 +- docs/gptq-whl-links.html | 6 +-- environments/huggingface.yml | 3 +- environments/rocm.yml | 3 +- install_requirements.sh | 4 +- koboldai_settings.py | 7 +++- modeling/inference_models/generic_hf_torch.py | 2 +- modeling/inference_models/hf.py | 2 +- modeling/inference_models/hf_torch.py | 2 +- modeling/inference_models/hf_torch_4bit.py | 38 ++++++------------- repos/__init__.py | 1 - repos/gptq | 1 - repos/hf_bleeding_edge | 1 - 13 files changed, 30 insertions(+), 42 deletions(-) delete mode 100644 repos/__init__.py delete mode 160000 repos/gptq delete mode 160000 repos/hf_bleeding_edge diff --git a/aiserver.py b/aiserver.py index 791ae071..11258fc1 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1621,7 +1621,7 @@ def get_layer_count(model, directory=""): else: if(directory): model = directory - from repos.hf_bleeding_edge import AutoConfig + from hf_bleeding_edge import AutoConfig if(os.path.isdir(model.replace('/', '_'))): model_config = AutoConfig.from_pretrained(model.replace('/', '_'), revision=koboldai_vars.revision, cache_dir="cache") elif(is_model_downloaded(model)): diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index fed8b397..427185db 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -1,3 +1,3 @@ -quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl -quant_rocm-0.0.0-cp38-cp38-linux_x86_64.whl -quant_cuda-0.0.0-cp38-cp38-win_amd64.whl +gptq-0.0.1-cp38-cp38-linux_x86_64.whl +gptq_rocm-0.0.1-cp38-cp38-linux_x86_64.whl +gptq-0.0.1-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index a179c468..f7fad2de 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -47,6 +47,7 @@ dependencies: - ftfy - pydub - diffusers + - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - quant_cuda + - gptq - einops diff --git a/environments/rocm.yml b/environments/rocm.yml index d0daf4f2..2b979d4c 
100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -42,6 +42,7 @@ dependencies: - ftfy - pydub - diffusers + - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - quant_rocm + - gptq_rocm - einops diff --git a/install_requirements.sh b/install_requirements.sh index 6e37c7e9..561b1b00 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -5,14 +5,14 @@ if [[ $1 = "cuda" || $1 = "CUDA" ]]; then wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster -bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y +# bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y exit fi if [[ $1 = "rocm" || $1 = "ROCM" ]]; then wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster -bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y +# bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y exit fi echo Please specify either CUDA or ROCM diff --git a/koboldai_settings.py b/koboldai_settings.py index d278dcc4..3e0fc48a 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1303,7 +1303,12 @@ class system_settings(settings): self.cookies = {} #cookies for colab since colab's URL changes, cookies are lost self.experimental_features = False # Check if repos/gptq exists for 4-bit mode - self.bit_4_available = os.path.isdir("repos/gptq") + self.bit_4_available = True + try: + import gptq + except ImportError: + self.bit_4_available = False + self.seen_messages = [] self.git_repository = "" self.git_branch = "" diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py index 61004db5..78a4bf9f 100644 --- a/modeling/inference_models/generic_hf_torch.py +++ b/modeling/inference_models/generic_hf_torch.py @@ -7,7 +7,7 @@ import shutil from typing import Union from transformers import GPTNeoForCausalLM, GPT2LMHeadModel -from repos.hf_bleeding_edge import AutoModelForCausalLM +from hf_bleeding_edge import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 8c797940..5ee2abaa 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -1,6 +1,6 @@ import os from typing import Optional -from repos.hf_bleeding_edge import AutoConfig +from hf_bleeding_edge import AutoConfig import utils import koboldai_settings diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index e0081c90..3339a75d 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -21,7 +21,7 @@ from transformers import ( GPT2LMHeadModel, LogitsProcessorList, ) -from repos.hf_bleeding_edge import AutoModelForCausalLM +from hf_bleeding_edge import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py 
index 75fb9ddf..959d6258 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -10,7 +10,7 @@ import sys from typing import Union from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer -from repos.hf_bleeding_edge import AutoModelForCausalLM +from hf_bleeding_edge import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader @@ -28,14 +28,13 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel from modeling.tokenizer import GenericTokenizer # 4-bit dependencies +import gptq from pathlib import Path -sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) -from gptj import load_quant as gptj_load_quant -from gptneox import load_quant as gptneox_load_quant -from llama import load_quant as llama_load_quant -from opt import load_quant as opt_load_quant -from offload import load_quant_offload -monkey_patched_4bit = False +from gptq.gptj import load_quant as gptj_load_quant +from gptq.gptneox import load_quant as gptneox_load_quant +from gptq.llama import load_quant as llama_load_quant +from gptq.opt import load_quant as opt_load_quant +from gptq.offload import load_quant_offload def prepare_4bit_load(modelpath): @@ -63,9 +62,6 @@ def prepare_4bit_load(modelpath): groupsize = int(g[0]) break - global monkey_patched_4bit - - # Monkey-patch in old-format pt-file support if not result: print("4-bit file not found, falling back to old format.") for p in paths_4bit_old: @@ -78,28 +74,16 @@ def prepare_4bit_load(modelpath): print("4-bit old-format file not found, loading failed.") raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.") - import llama, opt, gptneox, gptj, old_quant - llama.make_quant = old_quant.old_make_quant - opt.make_quant = old_quant.old_make_quant - gptneox.make_quant = old_quant.old_make_quant - gptj.make_quant = old_quant.old_make_quant - monkey_patched_4bit = True - elif monkey_patched_4bit: - # Undo monkey patch - print("Undoing 4-bit old format monkey patch") - import llama, opt, gptneox, gptj, quant - llama.make_quant = quant.make_quant - opt.make_quant = quant.make_quant - gptneox.make_quant = quant.make_quant - gptj.make_quant = quant.make_quant - monkey_patched_4bit = False + gptq.modelutils.set_gptq_version(0) + else: + gptq.modelutils.set_gptq_version(1) return result, groupsize def load_model_gptq_settings(): try: - js = json.loads(str(model.model_config).partition(' ')[2]) + js = json.loads(str(model.model_config).partition(' ')[2]) except Exception as e: try: try: diff --git a/repos/__init__.py b/repos/__init__.py deleted file mode 100644 index af438273..00000000 --- a/repos/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . 
import hf_bleeding_edge diff --git a/repos/gptq b/repos/gptq deleted file mode 160000 index 3c16fd9c..00000000 --- a/repos/gptq +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3c16fd9c7946ebe85df8d951cb742adbc1966ec7 diff --git a/repos/hf_bleeding_edge b/repos/hf_bleeding_edge deleted file mode 160000 index b5d0b80c..00000000 --- a/repos/hf_bleeding_edge +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b5d0b80c6947605b9ccf080fc17b68a516ea5857 From a9fa199c49ee8e903d609f2cab394a87b8a87d24 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 6 May 2023 21:30:33 +0200 Subject: [PATCH 068/113] Rename gptq module, pull fix --- docs/gptq-whl-links.html | 6 +++--- environments/huggingface.yml | 2 +- environments/rocm.yml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 427185db..08cd0cd7 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -1,3 +1,3 @@ -gptq-0.0.1-cp38-cp38-linux_x86_64.whl -gptq_rocm-0.0.1-cp38-cp38-linux_x86_64.whl -gptq-0.0.1-cp38-cp38-win_amd64.whl +gptq_koboldai-0.0.1-cp38-cp38-linux_x86_64.whl +gptq_koboldai_rocm-0.0.1-cp38-cp38-linux_x86_64.whl +gptq_koboldai-0.0.1-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index f7fad2de..12978b39 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -49,5 +49,5 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq + - gptq_koboldai==0.0.1 - einops diff --git a/environments/rocm.yml b/environments/rocm.yml index 2b979d4c..0cb44eb1 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -44,5 +44,5 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_rocm + - gptq_koboldai_rocm==0.0.1 - einops From 9ec50c997280856dee810a74e18cd11fd5304228 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 6 May 2023 21:58:23 +0200 Subject: [PATCH 069/113] Fix 4-bit mpt --- modeling/inference_models/hf_torch_4bit.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 959d6258..8aaddcc1 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -34,6 +34,7 @@ from gptq.gptj import load_quant as gptj_load_quant from gptq.gptneox import load_quant as gptneox_load_quant from gptq.llama import load_quant as llama_load_quant from gptq.opt import load_quant as opt_load_quant +from gptq.mpt import load_quant as mpt_load_quant from gptq.offload import load_quant_offload @@ -369,6 +370,8 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "opt": model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) + elif utils.koboldai_vars.model_type == "mpt": + model = load_quant_offload(mpt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) else: raise RuntimeError(f"4-bit load failed. 
Model type {utils.koboldai_vars.model_type} not supported in 4-bit") From 51e6dcdcd4c1a69318f3818a7cb153f7221ad07f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 7 May 2023 06:42:32 +0200 Subject: [PATCH 070/113] Revert accidental install_requirements change --- install_requirements.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install_requirements.sh b/install_requirements.sh index 561b1b00..6e37c7e9 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -5,14 +5,14 @@ if [[ $1 = "cuda" || $1 = "CUDA" ]]; then wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster -# bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y +bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y exit fi if [[ $1 = "rocm" || $1 = "ROCM" ]]; then wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster -# bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y +bin/micromamba create -f environments/rocm.yml -r runtime -n koboldai-rocm -y exit fi echo Please specify either CUDA or ROCM From 6b4d3218d62a35623a42e775d93b09da26f1aabc Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 7 May 2023 06:55:51 +0200 Subject: [PATCH 071/113] Fix OOM when loading large model split across GPUs --- modeling/inference_models/hf_torch_4bit.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 8aaddcc1..350cd761 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -139,10 +139,8 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): self.gpu_layers_list = [int(l) for l in gpulayers.split(",")] except ValueError: self.gpu_layers_list = [utils.num_layers(self.model_config)] - self.offload_4bit = sum(self.gpu_layers_list) < utils.num_layers(self.model_config) - if self.offload_4bit: - utils.koboldai_vars.lazy_load = False + if sum(self.gpu_layers_list) < utils.num_layers(self.model_config): print("4-bit CPU offloader active") tf_kwargs = { @@ -343,9 +341,6 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): self.patch_embedding() - if not self.offload_4bit: - self.model = self.model.half().to(utils.koboldai_vars.gpu_device) - self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() @@ -375,7 +370,7 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): else: raise RuntimeError(f"4-bit load failed. 
Model type {utils.koboldai_vars.model_type} not supported in 4-bit") - return model.half() if not self.offload_4bit else model + return model def _get_tokenizer(self, location: str): if utils.koboldai_vars.model_type == "llama": From e55a9d31c2e067ed42732dafddd6c67b696f3ceb Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 8 May 2023 22:55:59 +0200 Subject: [PATCH 072/113] Update readme, clean up gitmodules file --- .gitmodules | 8 -------- README.md | 6 ++---- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/.gitmodules b/.gitmodules index 4a1fb7c9..0107a8c3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,11 +4,3 @@ [submodule "KoboldAI-Horde-Bridge"] path = KoboldAI-Horde-Bridge url = https://github.com/db0/KoboldAI-Horde-Bridge -[submodule "repos/gptq"] - path = repos/gptq - url = https://github.com/0cc4m/GPTQ-for-LLaMa - branch = a8303654c200c25577130466e5f9bc1e70fc8a50 -[submodule "repos/hf_bleeding_edge"] - path = repos/hf_bleeding_edge - url = https://github.com/0cc4m/hf_bleeding_edge - branch = b5d0b80c6947605b9ccf080fc17b68a516ea5857 diff --git a/README.md b/README.md index aadfd345..517c00e8 100644 --- a/README.md +++ b/README.md @@ -42,16 +42,14 @@ Put your 4bit quantized .pt or .safetensors in that folder with all associated . Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) -So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). +So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, it'll load the quantized model. #### Running KoboldAI and loading 4bit models If you haven't done so already, exit the command prompt/leave KAI's conda env. (Close the commandline window on Windows, run `exit` on Linux) Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD] -Switch to UI2, then load your model and be sure 4-bit toggle is on. - -The 4bit toggle shows when a model to load is selected. +Switch to UI2, then load your model. 
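As a rough pre-flight check (not part of the patch; the folder path below is only an example), the following sketch mirrors the detection logic these patches add in `load_model_gptq_settings`: a model folder is treated as GPTQ-quantized when its `config.json` contains `gptq_bits`, or when a legacy `4bit*.pt` / `4bit*.safetensors` file is present:

```
import glob, json, os

def looks_like_gptq_model(path: str) -> bool:
    # config.json advertising gptq_bits marks the folder as a GPTQ model ...
    try:
        with open(os.path.join(path, "config.json")) as f:
            if "gptq_bits" in json.load(f):
                return True
    except (OSError, json.JSONDecodeError):
        pass
    # ... otherwise fall back to the legacy 4bit* file naming convention.
    legacy = glob.glob(os.path.join(path, "4bit*.pt")) + \
             glob.glob(os.path.join(path, "4bit*.safetensors"))
    return bool(legacy)

print(looks_like_gptq_model("models/llama-13b-4bit-128g"))  # example path
```

If this prints `False`, double-check the file names in your model folder against the convention described above.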
## KoboldAI - Your gateway to GPT writing From 4f94247910c1785b4fa15dc5eb81d664978a3f91 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 8 May 2023 22:56:17 +0200 Subject: [PATCH 073/113] Fix chat mode empty generation error --- utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils.py b/utils.py index 89b9fb4f..54083339 100644 --- a/utils.py +++ b/utils.py @@ -714,7 +714,7 @@ def applyoutputformatting(txt, no_sentence_trimming=False, no_single_line=False) txt = replaceblanklines(txt) # trim off starting new lines in replies if we're in chat mode - if koboldai_vars.chatmode and txt[0] == "\n": + if koboldai_vars.chatmode and txt and txt[0] == "\n": txt = txt[1:] # Remove special characters From 61215981424ce8abba076e687da0e60149b655ea Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 8 May 2023 22:57:09 +0200 Subject: [PATCH 074/113] Fix multigpu loading without lazy-loader --- modeling/inference_models/generic_hf_torch.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py index 78a4bf9f..2772503b 100644 --- a/modeling/inference_models/generic_hf_torch.py +++ b/modeling/inference_models/generic_hf_torch.py @@ -243,6 +243,11 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): ) shutil.rmtree("cache/") + if not self.lazy_load: + utils.layers_module_names = utils.get_layers_module_names(self.model) + utils.module_names = list(self.model.state_dict().keys()) + utils.named_buffers = list(self.model.named_buffers(recurse=True)) + self.patch_embedding() if utils.koboldai_vars.hascuda: From a2d01bb9e454a0c951fc9c4c3e67599bcf188b5b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 9 May 2023 22:19:18 +0200 Subject: [PATCH 075/113] Update to GPTQ module 0.0.2, add support for upstream cuda quantizations, automatic detection --- docs/gptq-whl-links.html | 4 ++ environments/huggingface.yml | 2 +- environments/rocm.yml | 6 +-- koboldai_settings.py | 1 + modeling/inference_models/hf.py | 3 +- modeling/inference_models/hf_torch_4bit.py | 59 +++++++++++++--------- 6 files changed, 46 insertions(+), 29 deletions(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 08cd0cd7..64d15d3d 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -1,3 +1,7 @@ gptq_koboldai-0.0.1-cp38-cp38-linux_x86_64.whl gptq_koboldai_rocm-0.0.1-cp38-cp38-linux_x86_64.whl gptq_koboldai-0.0.1-cp38-cp38-win_amd64.whl + +gptq_koboldai-0.0.2-cp38-cp38-linux_x86_64.whl +gptq_koboldai_rocm-0.0.2-cp38-cp38-linux_x86_64.whl +gptq_koboldai-0.0.2-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 12978b39..c381ea94 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -49,5 +49,5 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai==0.0.1 + - gptq_koboldai==0.0.2 - einops diff --git a/environments/rocm.yml b/environments/rocm.yml index 0cb44eb1..4f6cfa11 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -24,8 +24,8 @@ dependencies: - Pillow - psutil - pip: - - --extra-index-url https://download.pytorch.org/whl/rocm5.4.2 - - torch==2.0.0+rocm5.4.2 + - --extra-index-url https://download.pytorch.org/whl/rocm5.2 + - torch==1.13.1+rocm5.2 - flask-cloudflared==0.0.10 - flask-ngrok - flask-cors @@ -44,5 +44,5 @@ dependencies: - diffusers - 
git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai_rocm==0.0.1 + - gptq_koboldai_rocm==0.0.2 - einops diff --git a/koboldai_settings.py b/koboldai_settings.py index 3e0fc48a..f0df2162 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -925,6 +925,7 @@ class story_settings(settings): self.gptq_model = False self.gptq_bits = -1 self.gptq_groupsize = -1 + self.gptq_version = -1 self.gptq_file = None self.save_paths = SavePaths(os.path.join("stories", self.story_name or "Untitled")) diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 5ee2abaa..7050f34e 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -181,7 +181,8 @@ class HFInferenceModel(InferenceModel): if "gptq_bits" in dir(self.model_config): utils.koboldai_vars.gptq_model = True utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits - utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize + utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1 + utils.koboldai_vars.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1 utils.koboldai_vars.gptq_file = None else: utils.koboldai_vars.gptq_model = False diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 350cd761..5917a43e 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -48,8 +48,7 @@ def prepare_4bit_load(modelpath): return path_4bit, False # Legacy format support - paths_4bit = ["4bit*.safetensors", "4bit*.pt"] - paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] + paths_4bit = ["4bit*.safetensors", "4bit*.pt", "4bit-old.safetensors", "4bit-old.pt"] result = False groupsize = -1 for p in paths_4bit: @@ -59,26 +58,11 @@ def prepare_4bit_load(modelpath): result = val[0] fname = Path(result).parts[-1] g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + groupsize = -1 if g: groupsize = int(g[0]) break - if not result: - print("4-bit file not found, falling back to old format.") - for p in paths_4bit_old: - p = os.path.join(modelpath, p) - if os.path.isfile(p): - result = p - break - - if not result: - print("4-bit old-format file not found, loading failed.") - raise RuntimeError("4-bit load failed. 
PT/Safetensors-File not found.") - - gptq.modelutils.set_gptq_version(0) - else: - gptq.modelutils.set_gptq_version(1) - return result, groupsize @@ -103,6 +87,7 @@ def load_model_gptq_settings(): safetensors_file = os.path.join(utils.koboldai_vars.custmodpth, "model.safetensors") pt_file = os.path.join(utils.koboldai_vars.custmodpth, "model.ckpt") utils.koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file + utils.koboldai_vars.gptq_version = js.get("gptq_version", -1) elif gptq_legacy_files: utils.koboldai_vars.gptq_model = True utils.koboldai_vars.gptq_bits = 4 @@ -110,10 +95,37 @@ def load_model_gptq_settings(): fname = Path(utils.koboldai_vars.gptq_file).parts[-1] g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) utils.koboldai_vars.gptq_groupsize = int(g[0]) if g else -1 + utils.koboldai_vars.gptq_version = -1 else: utils.koboldai_vars.gptq_model = False +def get_gptq_version(fpath): + v1_strings = ["zeros", "scales", "bias", "qweight"] + v2_strings = ["qzeros", "scales", "bias", "qweight"] + v3_strings = ["qzeros", "scales", "g_idx", "qweight"] + + with open(fpath, "rb") as f: + data = str(f.read(1024*1024)) + + v0 = all([s in data for s in v1_strings]) and not "qzeros" in data + v1 = all([s in data for s in v2_strings]) + v2 = all([s in data for s in v3_strings]) + + if v2: + if v0 or v1: + logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}") + return 2 + if v1: + if v0 or v2: + logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}") + return 1 + if v0: + if v1 or v2: + logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}") + return 0 + + class HFTorch4BitInferenceModel(HFTorchInferenceModel): def _load(self, save_model: bool, initial_load: bool) -> None: utils.koboldai_vars.allowsp = True @@ -140,9 +152,6 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): except ValueError: self.gpu_layers_list = [utils.num_layers(self.model_config)] - if sum(self.gpu_layers_list) < utils.num_layers(self.model_config): - print("4-bit CPU offloader active") - tf_kwargs = { "low_cpu_mem_usage": True, } @@ -351,12 +360,14 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth) + if utils.koboldai_vars.gptq_version < 0: + utils.koboldai_vars.gptq_version = get_gptq_version(path_4bit) + gptq.modelutils.set_gptq_version(utils.koboldai_vars.gptq_version) + if legacy_groupsize is not False: groupsize = legacy_groupsize - print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") - - print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit") + logger.info(f"Using 4-bit file: {path_4bit}, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}") if utils.koboldai_vars.model_type == "gptj": model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "gpt_neox": From 266c0574f671e3038b75ee1d396c761d095f3592 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 13 May 2023 20:15:11 +0200 Subject: [PATCH 076/113] Fix 4bit pt loading, add traceback output to GPT2 fallback --- modeling/inference_models/hf_torch.py | 5 +++-- modeling/lazy_loader.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 3339a75d..dfb9d5f9 100644 --- a/modeling/inference_models/hf_torch.py +++ 
b/modeling/inference_models/hf_torch.py @@ -291,7 +291,7 @@ class HFTorchInferenceModel(HFInferenceModel): logger.error("Invalid load key! Aborting.") raise - logger.warning(f"Fell back to GPT2LMHeadModel due to {e}") + logger.warning(f"Fell back to GPT2LMHeadModel due to {traceback.format_exc()}") try: return GPT2LMHeadModel.from_pretrained(location, **tf_kwargs) except Exception as e: @@ -538,7 +538,8 @@ class HFTorchInferenceModel(HFInferenceModel): try: f = z.open(f"archive/data/{storage_key}") except: - f = z.open(f"{zipfolder}/data/{storage_key}") + ziproot = z.namelist()[0].split("/")[0] + f = z.open(f"{ziproot}/data/{storage_key}") current_offset = 0 if current_offset != model_dict[key].seek_offset: f.read(model_dict[key].seek_offset - current_offset) diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index 3dee5bae..14ece404 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -141,7 +141,8 @@ class TorchLazyTensor(LazyTensor): try: f = checkpoint.open(f"archive/data/{self.key}", "r") except: - f = checkpoint.open(f"{filename}/data/{self.key}", "r") + ziproot = z.namelist()[0].split("/")[0] + f = z.open(f"{ziproot}/data/{self.key}", "r") f.read(self.seek_offset) else: f = checkpoint From 7f7b350741ebeb7e9157a240846740a845d077e6 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 13 May 2023 20:31:01 +0200 Subject: [PATCH 077/113] Catch further error during multigpu 4bit setup --- modeling/inference_models/hf_torch_4bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 5917a43e..2fd4cb89 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -149,7 +149,7 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): try: self.gpu_layers_list = [int(l) for l in gpulayers.split(",")] - except ValueError: + except (ValueError, AttributeError): self.gpu_layers_list = [utils.num_layers(self.model_config)] tf_kwargs = { From 3d4d5df76bfc2e6c832c3e8f174f77a23557cf02 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 13 May 2023 20:33:13 +0200 Subject: [PATCH 078/113] Remove rocm wheel, because it didn't work correctly --- README.md | 3 +-- environments/rocm.yml | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index 517c00e8..5f4bc5c7 100644 --- a/README.md +++ b/README.md @@ -28,8 +28,7 @@ Next step, (Windows) subfolder mode or B: option doesn't matter choose either ``` ./install_requirements.sh rocm ./commandline-rocm.sh - cd repos/gptq - python setup_cuda.py install + pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa@c884b421a233f9603d8224c9b22c2d83dd2c1fc4 ``` * If you get error missing hip/hip_runtime_xxx.h you dont have proper rocm & hip pkg installed * If you get CUDA_HOME envar is not set run in env: diff --git a/environments/rocm.yml b/environments/rocm.yml index 4f6cfa11..4e53a821 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -43,6 +43,4 @@ dependencies: - pydub - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai_rocm==0.0.2 - einops From 2c18d9f2b5dba9caad378f3ed04f84d408720e36 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 18 May 2023 21:51:03 +0200 Subject: [PATCH 079/113] Update GPTQ module to 0.0.3 --- docs/gptq-whl-links.html | 3 +++ environments/huggingface.yml | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git 
a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 64d15d3d..750c0746 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -5,3 +5,6 @@ gptq_koboldai-0.0.2-cp38-cp38-linux_x86_64.whl gptq_koboldai_rocm-0.0.2-cp38-cp38-linux_x86_64.whl gptq_koboldai-0.0.2-cp38-cp38-win_amd64.whl + +gptq_koboldai-0.0.3-cp38-cp38-linux_x86_64.whl +gptq_koboldai-0.0.3-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index c381ea94..e4aac1ed 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -49,5 +49,5 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai==0.0.2 + - gptq_koboldai==0.0.3 - einops From d5eac13d9f76484d991e33e0cc3a487fc5119937 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 19 May 2023 18:22:26 +0200 Subject: [PATCH 080/113] Fix 2, 3 and 8-bit loading --- modeling/inference_models/hf_torch_4bit.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py index 2fd4cb89..580fa306 100644 --- a/modeling/inference_models/hf_torch_4bit.py +++ b/modeling/inference_models/hf_torch_4bit.py @@ -367,17 +367,17 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel): if legacy_groupsize is not False: groupsize = legacy_groupsize - logger.info(f"Using 4-bit file: {path_4bit}, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}") + logger.info(f"Using GPTQ file: {path_4bit}, {utils.koboldai_vars.gptq_bits}-bit model, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}") if utils.koboldai_vars.model_type == "gptj": - model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) + model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "gpt_neox": - model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) + model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "llama": - model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) + model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "opt": - model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) + model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) elif utils.koboldai_vars.model_type == "mpt": - model = load_quant_offload(mpt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list) + model = load_quant_offload(mpt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) else: raise RuntimeError(f"4-bit load failed. 
Model type {utils.koboldai_vars.model_type} not supported in 4-bit") From c32932998dda6ababec9687e0d4970a6f0f70922 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 19 May 2023 21:51:38 +0200 Subject: [PATCH 081/113] Update GPTQ module to 0.0.4 --- docs/gptq-whl-links.html | 3 +++ environments/huggingface.yml | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 750c0746..34d05691 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -8,3 +8,6 @@ gptq_koboldai-0.0.3-cp38-cp38-linux_x86_64.whl gptq_koboldai-0.0.3-cp38-cp38-win_amd64.whl + +gptq_koboldai-0.0.4-cp38-cp38-linux_x86_64.whl +gptq_koboldai-0.0.4-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index e4aac1ed..c7d03ad0 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -49,5 +49,5 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai==0.0.3 + - gptq_koboldai==0.0.4 - einops From e49d35afc935f3a52155a0bc9f9d200a84e1ad41 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 28 May 2023 22:54:36 +0200 Subject: [PATCH 082/113] Add 4bit plugin --- aiserver.py | 1 - .../inference_models/4bit_hf_torch/class.py | 227 ++++++++++ .../generic_hf_torch/class.py | 13 +- modeling/inference_models/hf.py | 4 - modeling/inference_models/hf_torch_4bit.py | 392 ------------------ 5 files changed, 233 insertions(+), 404 deletions(-) create mode 100644 modeling/inference_models/4bit_hf_torch/class.py delete mode 100644 modeling/inference_models/hf_torch_4bit.py diff --git a/aiserver.py b/aiserver.py index c28633d6..3c574431 100644 --- a/aiserver.py +++ b/aiserver.py @@ -623,7 +623,6 @@ utils.socketio = socketio # Weird import position to steal koboldai_vars from utils from modeling.patches import patch_transformers -from modeling.inference_models.hf_torch_4bit import load_model_gptq_settings #Load all of the model importers import importlib diff --git a/modeling/inference_models/4bit_hf_torch/class.py b/modeling/inference_models/4bit_hf_torch/class.py new file mode 100644 index 00000000..62f04bfb --- /dev/null +++ b/modeling/inference_models/4bit_hf_torch/class.py @@ -0,0 +1,227 @@ +from __future__ import annotations + +import os +import glob +import json +import torch +import re +import shutil +import sys +from typing import Union + +from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer +from hf_bleeding_edge import AutoModelForCausalLM + +import utils +import modeling.lazy_loader as lazy_loader +import koboldai_settings +from logger import logger, set_logger_verbosity + +try: + import breakmodel +except ModuleNotFoundError as e: + # Breakmodel is only expected to work on GPU + if not utils.koboldai_vars.use_colab_tpu: + raise e + +from modeling.inference_models.hf_torch import HFTorchInferenceModel +from modeling.tokenizer import GenericTokenizer + +# 4-bit dependencies +import gptq +from pathlib import Path +from gptq.gptj import load_quant as gptj_load_quant +from gptq.gptneox import load_quant as gptneox_load_quant +from gptq.llama import load_quant as llama_load_quant +from gptq.opt import load_quant as opt_load_quant +from gptq.mpt import load_quant as mpt_load_quant +from gptq.offload import load_quant_offload + + +model_backend_name = "Huggingface GPTQ" + + +def load_model_gptq_settings(path): + try: + js = json.load(open(path + "/config.json", "r")) + except 
Exception as e: + return False, -1, -1, False, -1 + + gptq_model = False + gptq_bits = -1 + gptq_groupsize = -1 + gptq_file = False + gptq_version = -1 + + gptq_legacy_files = glob.glob(os.path.join(path, "4bit*.pt")) + glob.glob(os.path.join(path, "4bit*.safetensors")) + if "gptq_bits" in js: + gptq_model = True + gptq_bits = js["gptq_bits"] + gptq_groupsize = js.get("gptq_groupsize", -1) + safetensors_file = os.path.join(path, "model.safetensors") + pt_file = os.path.join(path, "model.ckpt") + gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file + gptq_version = js.get("gptq_version", -1) + elif gptq_legacy_files: + gptq_model = True + gptq_bits = 4 + gptq_file = gptq_legacy_files[0] + fname = Path(gptq_file).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + gptq_groupsize = int(g[0]) if g else -1 + gptq_version = -1 + + return gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version + + +def get_gptq_version(fpath): + v1_strings = ["zeros", "scales", "bias", "qweight"] + v2_strings = ["qzeros", "scales", "bias", "qweight"] + v3_strings = ["qzeros", "scales", "g_idx", "qweight"] + + with open(fpath, "rb") as f: + data = str(f.read(1024*1024)) + + v0 = all([s in data for s in v1_strings]) and not "qzeros" in data + v1 = all([s in data for s in v2_strings]) + v2 = all([s in data for s in v3_strings]) + + if v2: + if v0 or v1: + logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}") + return 2 + if v1: + if v0 or v2: + logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}") + return 1 + if v0: + if v1 or v2: + logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}") + return 0 + + +class model_backend(HFTorchInferenceModel): + def is_valid(self, model_name, model_path, menu_path): + gptq_model, _, _, _, _ = load_model_gptq_settings(model_path) + return gptq_model + + def _load(self, save_model: bool, initial_load: bool) -> None: + # Make model path the same as the model name to make this consistent + # with the other loading method if it isn't a known model type. 
This + # code is not just a workaround for below, it is also used to make the + # behavior consistent with other loading methods - Henk717 + # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]: + # utils.koboldai_vars.custmodpth = utils.koboldai_vars.model + + self.init_model_config() + + self.lazy_load = False + + gpulayers = breakmodel.gpu_blocks + + try: + self.gpu_layers_list = [int(l) for l in gpulayers.split(",")] + except (ValueError, AttributeError): + self.gpu_layers_list = [utils.num_layers(self.model_config)] + + tf_kwargs = { + "low_cpu_mem_usage": True, + } + + # If we're using torch_lazy_loader, we need to get breakmodel config + # early so that it knows where to load the individual model tensors + logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel)) + if ( + self.lazy_load + and utils.koboldai_vars.hascuda + and utils.koboldai_vars.breakmodel + and not utils.koboldai_vars.nobreakmodel + ): + self.breakmodel_device_config(self.model_config) + + if self.lazy_load: + # If we're using lazy loader, we need to figure out what the model's hidden layers are called + with lazy_loader.use_lazy_load( + dematerialized_modules=True, use_accelerate_init_empty_weights=True + ): + try: + metamodel = AutoModelForCausalLM.from_config(self.model_config) + utils.layers_module_names = utils.get_layers_module_names(metamodel) + utils.module_names = list(metamodel.state_dict().keys()) + utils.named_buffers = list(metamodel.named_buffers(recurse=True)) + except Exception as e: + logger.warning(f"Gave up on lazy loading due to {e}") + self.lazy_load = False + + # Download model from Huggingface if it does not exist, otherwise load locally + with self._maybe_use_float16(), lazy_loader.use_lazy_load( + enable=self.lazy_load, + callback=self._get_lazy_load_callback(utils.num_layers(self.model_config)) + if self.lazy_load + else None, + dematerialized_modules=True, + ): + if self.lazy_load: + # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time + tf_kwargs.pop("low_cpu_mem_usage", None) + + if self.get_local_model_path(): + # Model is stored locally, load it. 
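+ # (explanatory note) _get_model() below reads the quantization settings via
+ # load_model_gptq_settings()/get_gptq_version(), then hands the checkpoint to the
+ # matching gptq loader (gptj/gpt_neox/llama/opt/mpt) through load_quant_offload(),
+ # which distributes layers across GPUs according to self.gpu_layers_list.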
+ self.model = self._get_model(self.get_local_model_path(), tf_kwargs) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) + else: + raise NotImplementedError("GPTQ Model downloading not implemented") + + if not self.lazy_load: + utils.layers_module_names = utils.get_layers_module_names(self.model) + utils.module_names = list(self.model.state_dict().keys()) + utils.named_buffers = list(self.model.named_buffers(recurse=True)) + + if ( + utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default + and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj") + ): + utils.koboldai_vars.badwordsids = [ + [v] + for k, v in self.tokenizer.get_vocab().items() + if any(c in str(k) for c in "[]") + ] + + self.patch_embedding() + + self.model.kai_model = self + utils.koboldai_vars.modeldim = self.get_hidden_size() + + def _get_model(self, location: str, tf_kwargs: Dict): + gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location) + + if gptq_version < 0: + gptq_version = get_gptq_version(gptq_file) + gptq.modelutils.set_gptq_version(gptq_version) + + model_type = self.get_model_type() + + logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}, groupsize {gptq_groupsize}") + if model_type == "gptj": + model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + elif model_type == "gpt_neox": + model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + elif model_type == "llama": + model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + elif model_type == "opt": + model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + elif model_type == "mpt": + model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + else: + raise RuntimeError(f"4-bit load failed. 
Model type {model_type} not supported in 4-bit") + + return model + + def _get_tokenizer(self, location: str): + model_type = self.get_model_type() + if model_type == "llama": + tokenizer = LlamaTokenizer.from_pretrained(location) + else: + tokenizer = AutoTokenizer.from_pretrained(location) + + return GenericTokenizer(tokenizer) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index 50d7503c..93bc08ea 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -25,8 +25,12 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel model_backend_name = "Huggingface" -class GenericHFTorchInferenceModel(HFTorchInferenceModel): - def load_config(self) -> None: +class model_backend(HFTorchInferenceModel): + + def _initialize_model(self): + return + + def _load(self, save_model: bool, initial_load: bool) -> None: utils.koboldai_vars.allowsp = True # Make model path the same as the model name to make this consistent @@ -243,11 +247,6 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): ) shutil.rmtree("cache/") - if not self.lazy_load: - utils.layers_module_names = utils.get_layers_module_names(self.model) - utils.module_names = list(self.model.state_dict().keys()) - utils.named_buffers = list(self.model.named_buffers(recurse=True)) - self.patch_embedding() diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 93e1757a..dc34636a 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -1,12 +1,8 @@ import os, sys from typing import Optional -<<<<<<< HEAD from hf_bleeding_edge import AutoConfig -======= -from transformers import AutoConfig import warnings ->>>>>>> ebolam/Model_Plugins import utils import json import koboldai_settings diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py deleted file mode 100644 index 580fa306..00000000 --- a/modeling/inference_models/hf_torch_4bit.py +++ /dev/null @@ -1,392 +0,0 @@ -from __future__ import annotations - -import os -import glob -import json -import torch -import re -import shutil -import sys -from typing import Union - -from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer -from hf_bleeding_edge import AutoModelForCausalLM - -import utils -import modeling.lazy_loader as lazy_loader -import koboldai_settings -from logger import logger, set_logger_verbosity - -try: - import breakmodel -except ModuleNotFoundError as e: - # Breakmodel is only expected to work on GPU - if not utils.koboldai_vars.use_colab_tpu: - raise e - -from modeling.inference_models.hf_torch import HFTorchInferenceModel -from modeling.tokenizer import GenericTokenizer - -# 4-bit dependencies -import gptq -from pathlib import Path -from gptq.gptj import load_quant as gptj_load_quant -from gptq.gptneox import load_quant as gptneox_load_quant -from gptq.llama import load_quant as llama_load_quant -from gptq.opt import load_quant as opt_load_quant -from gptq.mpt import load_quant as mpt_load_quant -from gptq.offload import load_quant_offload - - -def prepare_4bit_load(modelpath): - path_4bit = os.path.join(modelpath, "model.safetensors") - if os.path.isfile(path_4bit): - return path_4bit, False - - path_4bit = os.path.join(modelpath, "model.ckpt") - if os.path.isfile(path_4bit): - return path_4bit, False - - # Legacy format support - paths_4bit = ["4bit*.safetensors", "4bit*.pt", "4bit-old.safetensors", 
"4bit-old.pt"] - result = False - groupsize = -1 - for p in paths_4bit: - p = os.path.join(modelpath, p) - val = [v for v in glob.glob(p) if "4bit-old" not in v] - if val: - result = val[0] - fname = Path(result).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) - groupsize = -1 - if g: - groupsize = int(g[0]) - break - - return result, groupsize - - -def load_model_gptq_settings(): - try: - js = json.loads(str(model.model_config).partition(' ')[2]) - except Exception as e: - try: - try: - js = json.load(open(utils.koboldai_vars.custmodpth + "/config.json", "r")) - except Exception as e: - js = json.load(open(utils.koboldai_vars.custmodpth.replace('/', '_') + "/config.json", "r")) - except Exception as e: - utils.koboldai_vars.gptq_model = False - return - - gptq_legacy_files = glob.glob(os.path.join(utils.koboldai_vars.custmodpth, "4bit*.pt")) + glob.glob(os.path.join(utils.koboldai_vars.custmodpth, "4bit*.safetensors")) - if "gptq_bits" in js: - utils.koboldai_vars.gptq_model = True - utils.koboldai_vars.gptq_bits = js["gptq_bits"] - utils.koboldai_vars.gptq_groupsize = js.get("gptq_groupsize", -1) - safetensors_file = os.path.join(utils.koboldai_vars.custmodpth, "model.safetensors") - pt_file = os.path.join(utils.koboldai_vars.custmodpth, "model.ckpt") - utils.koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file - utils.koboldai_vars.gptq_version = js.get("gptq_version", -1) - elif gptq_legacy_files: - utils.koboldai_vars.gptq_model = True - utils.koboldai_vars.gptq_bits = 4 - utils.koboldai_vars.gptq_file = gptq_legacy_files[0] - fname = Path(utils.koboldai_vars.gptq_file).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) - utils.koboldai_vars.gptq_groupsize = int(g[0]) if g else -1 - utils.koboldai_vars.gptq_version = -1 - else: - utils.koboldai_vars.gptq_model = False - - -def get_gptq_version(fpath): - v1_strings = ["zeros", "scales", "bias", "qweight"] - v2_strings = ["qzeros", "scales", "bias", "qweight"] - v3_strings = ["qzeros", "scales", "g_idx", "qweight"] - - with open(fpath, "rb") as f: - data = str(f.read(1024*1024)) - - v0 = all([s in data for s in v1_strings]) and not "qzeros" in data - v1 = all([s in data for s in v2_strings]) - v2 = all([s in data for s in v3_strings]) - - if v2: - if v0 or v1: - logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}") - return 2 - if v1: - if v0 or v2: - logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}") - return 1 - if v0: - if v1 or v2: - logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}") - return 0 - - -class HFTorch4BitInferenceModel(HFTorchInferenceModel): - def _load(self, save_model: bool, initial_load: bool) -> None: - utils.koboldai_vars.allowsp = True - - # Make model path the same as the model name to make this consistent - # with the other loading method if it isn't a known model type. 
This - # code is not just a workaround for below, it is also used to make the - # behavior consistent with other loading methods - Henk717 - # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]: - # utils.koboldai_vars.custmodpth = utils.koboldai_vars.model - - if self.model_name == "NeoCustom": - self.model_name = os.path.basename( - os.path.normpath(utils.koboldai_vars.custmodpth) - ) - utils.koboldai_vars.model = self.model_name - - self.init_model_config() - - gpulayers = utils.args.breakmodel_gpulayers - - try: - self.gpu_layers_list = [int(l) for l in gpulayers.split(",")] - except (ValueError, AttributeError): - self.gpu_layers_list = [utils.num_layers(self.model_config)] - - tf_kwargs = { - "low_cpu_mem_usage": True, - } - - # If we're using torch_lazy_loader, we need to get breakmodel config - # early so that it knows where to load the individual model tensors - if ( - self.lazy_load - and utils.koboldai_vars.hascuda - and utils.koboldai_vars.breakmodel - and not utils.koboldai_vars.nobreakmodel - ): - self.breakmodel_device_config(self.model_config) - - if self.lazy_load: - # If we're using lazy loader, we need to figure out what the model's hidden layers are called - with lazy_loader.use_lazy_load( - dematerialized_modules=True, use_accelerate_init_empty_weights=True - ): - try: - metamodel = AutoModelForCausalLM.from_config(self.model_config) - utils.layers_module_names = utils.get_layers_module_names(metamodel) - utils.module_names = list(metamodel.state_dict().keys()) - utils.named_buffers = list(metamodel.named_buffers(recurse=True)) - except Exception as e: - logger.warning(f"Gave up on lazy loading due to {e}") - self.lazy_load = False - - # Download model from Huggingface if it does not exist, otherwise load locally - with self._maybe_use_float16(), lazy_loader.use_lazy_load( - enable=self.lazy_load, - callback=self._get_lazy_load_callback(utils.num_layers(self.model_config)) - if self.lazy_load - else None, - dematerialized_modules=True, - ): - if self.lazy_load: - # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - tf_kwargs.pop("low_cpu_mem_usage", None) - - if self.get_local_model_path(): - # Model is stored locally, load it. - self.model = self._get_model(self.get_local_model_path(), tf_kwargs) - self.tokenizer = self._get_tokenizer(self.get_local_model_path()) - else: - # Model not stored locally, we need to download it. 
- - # _rebuild_tensor patch for casting dtype and supporting LazyTensors - old_rebuild_tensor = torch._utils._rebuild_tensor - - def new_rebuild_tensor( - storage: Union[lazy_loader.LazyTensor, torch.Storage], - storage_offset, - shape, - stride, - ): - if not isinstance(storage, lazy_loader.LazyTensor): - dtype = storage.dtype - else: - dtype = storage.storage_type.dtype - if not isinstance(dtype, torch.dtype): - dtype = storage.storage_type(0).dtype - if dtype is torch.float32 and len(shape) >= 2: - utils.koboldai_vars.fp32_model = True - return old_rebuild_tensor(storage, storage_offset, shape, stride) - - torch._utils._rebuild_tensor = new_rebuild_tensor - self.model = self._get_model(self.model_name, tf_kwargs) - self.tokenizer = self._get_tokenizer(self.model_name) - torch._utils._rebuild_tensor = old_rebuild_tensor - - if save_model: - self.tokenizer.save_pretrained( - self.get_local_model_path(ignore_existance=True) - ) - - if utils.koboldai_vars.fp32_model and not breakmodel.disk_blocks: - # Use save_pretrained to convert fp32 models to fp16, - # unless we are using disk cache because save_pretrained - # is not supported in that case - self.model = self.model.half() - self.model.save_pretrained( - self.get_local_model_path(ignore_existance=True), - max_shard_size="500MiB", - ) - - else: - # For fp16 models, we can just copy the model files directly - import transformers.configuration_utils - import transformers.modeling_utils - import transformers.file_utils - import huggingface_hub - - # Save the config.json - shutil.move( - os.path.realpath( - huggingface_hub.hf_hub_download( - self.model_name, - transformers.configuration_utils.CONFIG_NAME, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=False, - ) - ), - os.path.join( - self.get_local_model_path(ignore_existance=True), - transformers.configuration_utils.CONFIG_NAME, - ), - ) - - if utils.num_shards is None: - # Save the pytorch_model.bin or model.safetensors of an unsharded model - any_success = False - possible_checkpoint_names = [ - transformers.modeling_utils.WEIGHTS_NAME, - "model.safetensors", - ] - - for possible_checkpoint_name in possible_checkpoint_names: - try: - shutil.move( - os.path.realpath( - huggingface_hub.hf_hub_download( - self.model_name, - possible_checkpoint_name, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=False, - ) - ), - os.path.join( - self.get_local_model_path( - ignore_existance=True - ), - possible_checkpoint_name, - ), - ) - any_success = True - except Exception: - pass - - if not any_success: - raise RuntimeError(f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revisison}'") - else: - # Handle saving sharded models - - with open(utils.from_pretrained_index_filename) as f: - map_data = json.load(f) - filenames = set(map_data["weight_map"].values()) - # Save the pytorch_model.bin.index.json of a sharded model - shutil.move( - os.path.realpath(utils.from_pretrained_index_filename), - os.path.join( - self.get_local_model_path(ignore_existance=True), - transformers.modeling_utils.WEIGHTS_INDEX_NAME, - ), - ) - # Then save the pytorch_model-#####-of-#####.bin files - for filename in filenames: - shutil.move( - os.path.realpath( - huggingface_hub.hf_hub_download( - self.model_name, - filename, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=False, - ) - ), - 
os.path.join( - self.get_local_model_path( - ignore_existance=True - ), - filename, - ), - ) - shutil.rmtree("cache/") - - if not self.lazy_load: - utils.layers_module_names = utils.get_layers_module_names(self.model) - utils.module_names = list(self.model.state_dict().keys()) - utils.named_buffers = list(self.model.named_buffers(recurse=True)) - - if ( - utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default - and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj") - ): - utils.koboldai_vars.badwordsids = [ - [v] - for k, v in self.tokenizer.get_vocab().items() - if any(c in str(k) for c in "[]") - ] - - self.patch_embedding() - - self.model.kai_model = self - utils.koboldai_vars.modeldim = self.get_hidden_size() - - def _get_model(self, location: str, tf_kwargs: Dict): - if not utils.koboldai_vars.custmodpth: - pass - groupsize = utils.koboldai_vars.gptq_groupsize - - path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth) - - if utils.koboldai_vars.gptq_version < 0: - utils.koboldai_vars.gptq_version = get_gptq_version(path_4bit) - gptq.modelutils.set_gptq_version(utils.koboldai_vars.gptq_version) - - if legacy_groupsize is not False: - groupsize = legacy_groupsize - - logger.info(f"Using GPTQ file: {path_4bit}, {utils.koboldai_vars.gptq_bits}-bit model, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}") - if utils.koboldai_vars.model_type == "gptj": - model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) - elif utils.koboldai_vars.model_type == "gpt_neox": - model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) - elif utils.koboldai_vars.model_type == "llama": - model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) - elif utils.koboldai_vars.model_type == "opt": - model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) - elif utils.koboldai_vars.model_type == "mpt": - model = load_quant_offload(mpt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list) - else: - raise RuntimeError(f"4-bit load failed. 
Model type {utils.koboldai_vars.model_type} not supported in 4-bit") - - return model - - def _get_tokenizer(self, location: str): - if utils.koboldai_vars.model_type == "llama": - tokenizer = LlamaTokenizer.from_pretrained(utils.koboldai_vars.custmodpth) - else: - tokenizer = AutoTokenizer.from_pretrained(utils.koboldai_vars.custmodpth) - - return GenericTokenizer(tokenizer) From cf886de18b0a4d653a3f78b4dadaf390536fa322 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 30 May 2023 19:15:20 +0200 Subject: [PATCH 083/113] Remove leftover values fro koboldai_settings.py --- koboldai_settings.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/koboldai_settings.py b/koboldai_settings.py index ae8d33cc..cd8fdafa 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -920,13 +920,6 @@ class story_settings(settings): self.commentary_chance = 0 self.commentary_enabled = False - # 4bit model vals - self.gptq_model = False - self.gptq_bits = -1 - self.gptq_groupsize = -1 - self.gptq_version = -1 - self.gptq_file = None - self.save_paths = SavePaths(os.path.join("stories", self.story_name or "Untitled")) ################### must be at bottom ######################### From b7838c7dde202502369c2461834076adfc4e22a3 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 4 Jun 2023 08:06:48 +0200 Subject: [PATCH 084/113] Fall back to autogptq if available and model not supported by gptq-koboldai --- .../inference_models/4bit_hf_torch/class.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/modeling/inference_models/4bit_hf_torch/class.py b/modeling/inference_models/4bit_hf_torch/class.py index 62f04bfb..7d7dfc00 100644 --- a/modeling/inference_models/4bit_hf_torch/class.py +++ b/modeling/inference_models/4bit_hf_torch/class.py @@ -10,6 +10,7 @@ import sys from typing import Union from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer +import hf_bleeding_edge from hf_bleeding_edge import AutoModelForCausalLM import utils @@ -37,6 +38,13 @@ from gptq.opt import load_quant as opt_load_quant from gptq.mpt import load_quant as mpt_load_quant from gptq.offload import load_quant_offload +autogptq_support = True +try: + import auto_gptq + from auto_gptq import AutoGPTQForCausalLM +except ImportError: + autogptq_support = False + model_backend_name = "Huggingface GPTQ" @@ -212,6 +220,26 @@ class model_backend(HFTorchInferenceModel): model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) elif model_type == "mpt": model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + elif autogptq_support: + # Monkey patch in hf_bleeding_edge to avoid having to trust remote code + auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig + auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig + auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM + model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors")) + + # Patch in embeddings function + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + type(model).get_input_embeddings = get_input_embeddings + + # Patch in args support.. 
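# [Editor's note - not part of the original patch: a hedged sketch of why the two
# monkey patches in this AutoGPTQ fallback exist. AutoGPTQForCausalLM keeps the
# underlying transformers model at `self.model`, so forwarding
# get_input_embeddings() and generate() to it lets the rest of the HF torch
# backend treat the quantized wrapper like a plain HF model, e.g. the
# hypothetical call:
#     model.generate(input_ids, max_new_tokens=8)]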
+ def generate(self, *args, **kwargs): + """shortcut for model.generate""" + with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type): + return self.model.generate(*args, **kwargs) + + type(model).generate = generate else: raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit") From 974328ed22ceca9a6e1a1c37ed135977c3429fee Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 4 Jun 2023 09:02:15 +0200 Subject: [PATCH 085/113] Add 4bit requirements to requirements.txt --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index c98b7252..0707cebe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,4 +37,7 @@ pytest-html==3.2.0 pytest-metadata==2.0.4 requests-mock==1.10.0 safetensors==0.3.1 +git+https://github.com/0cc4m/hf_bleeding_edge/ +--find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html gptq_koboldai==0.0.4 +einops peft==0.3.0 From 05a0bfe6c4bac8a1f7c070203cb69d1825a70e4e Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 4 Jun 2023 09:44:28 +0200 Subject: [PATCH 086/113] Don't show HF support if no HF model files are found --- .../inference_models/generic_hf_torch/class.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index a0b7b4cb..b56a7c45 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -9,6 +9,8 @@ from typing import Union from transformers import GPTNeoForCausalLM, GPT2LMHeadModel from hf_bleeding_edge import AutoModelForCausalLM +from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME + import utils import modeling.lazy_loader as lazy_loader import koboldai_settings @@ -27,6 +29,19 @@ model_backend_name = "Huggingface" class model_backend(HFTorchInferenceModel): + def is_valid(self, model_name, model_path, menu_path): + base_is_valid = super().is_valid(model_name, model_path, menu_path) + path = False + gen_path = "models/{}".format(model_name.replace('/', '_')) + if model_path is not None and os.path.exists(model_path): + path = model_path + elif os.path.exists(gen_path): + path = gen_path + + fnames = [WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME] + + return base_is_valid and any(os.path.exists(os.path.join(path, fname)) for fname in fnames) + def _initialize_model(self): return From c82625490a110bd5799463fa05f6ebc710e3516e Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 4 Jun 2023 12:31:24 +0200 Subject: [PATCH 087/113] Rename gptq backend folder --- .../inference_models/{4bit_hf_torch => gptq_hf_torch}/class.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename modeling/inference_models/{4bit_hf_torch => gptq_hf_torch}/class.py (100%) diff --git a/modeling/inference_models/4bit_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py similarity index 100% rename from modeling/inference_models/4bit_hf_torch/class.py rename to modeling/inference_models/gptq_hf_torch/class.py From b35f61e987841bd79dacdbe5c8b1cf6c75735f01 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 4 Jun 2023 15:40:12 +0200 Subject: [PATCH 088/113] Basic exllama plugin --- 
modeling/inference_models/exllama/class.py | 277 +++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 modeling/inference_models/exllama/class.py diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py new file mode 100644 index 00000000..0160ed4b --- /dev/null +++ b/modeling/inference_models/exllama/class.py @@ -0,0 +1,277 @@ +from __future__ import annotations + +import time, json +import torch +import requests +import numpy as np +from typing import List, Optional, Union +import os +import glob +from pathlib import Path +import re + +import utils +from logger import logger + +from modeling.inference_model import ( + GenerationResult, + GenerationSettings, + InferenceModel, + ModelCapabilities, +) + +from modeling.tokenizer import GenericTokenizer + +from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig +from transformers import LlamaTokenizer +from exllama.generator import ExLlamaGenerator + +import traceback + +model_backend_name = "ExLlama" + + +def load_model_gptq_settings(path): + try: + js = json.load(open(path + "/config.json", "r")) + except Exception as e: + return False + + gptq_model = False + gptq_file = False + + gptq_legacy_files = glob.glob(os.path.join(path, "4bit*.safetensors")) + if "gptq_bits" in js: + gptq_model = True + gptq_file = os.path.join(path, "model.safetensors") + elif gptq_legacy_files: + gptq_model = True + gptq_file = gptq_legacy_files[0] + fname = Path(gptq_file).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + + return gptq_model, gptq_file + + +class model_backend(InferenceModel): + def __init__(self) -> None: + super().__init__() + self.model_config = None + + self.model = None + self.tokenizer = None + + self.model_name = None + self.path = None + + def is_valid(self, model_name, model_path, menu_path): + gptq_model, _ = load_model_gptq_settings(model_path) + try: + self.model_config = self._load_config(model_name, model_path) + return self.model_config and gptq_model + except: + return False + + def get_local_model_path(self): + return self.path or os.path.join("models", self.model_name.replace("/", "_")) + + def _load_config(self, model_name, model_path): + if model_path is not None and os.path.exists(model_path): + return ExLlamaConfig(os.path.join(model_path, "config.json")) + if(os.path.exists("models/{}".format(model_name.replace('/', '_')))): + return ExLlamaConfig(os.path.join("models/{}".format(model_name.replace('/', '_')), "config.json")) + return False + + def _load(self, save_model: bool, initial_load: bool) -> None: + self.model = self._get_model(self.get_local_model_path(), {}) + self.tokenizer = self._get_tokenizer(os.path.join(self.get_local_model_path(), "tokenizer.model")) + + self.cache = ExLlamaCache(self.model) + + self.generator = ExLlamaGenerator(self.model, self.tokenizer.tokenizer, self.cache) + + def _post_load(self) -> None: + # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer + self.tokenizer.add_bos_token = False + + # HF transformers no longer supports decode_with_prefix_space + # We work around this by wrapping decode, encode, and __call__ + # with versions that work around the 'prefix space' misfeature + # of sentencepiece. 
+ vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size)) + has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")} + + # Wrap 'decode' with a method that always returns text starting with a space + # when the head token starts with a space. This is what 'decode_with_prefix_space' + # used to do, and we implement it using the same technique (building a cache of + # tokens that should have a prefix space, and then prepending a space if the first + # token is in this set.) We also work around a bizarre behavior in which decoding + # a single token 13 behaves differently than decoding a squence containing only [13]. + original_decode = type(self.tokenizer.tokenizer).decode + def decode_wrapper(self, token_ids, *args, **kwargs): + first = None + # Note, the code below that wraps single-value token_ids in a list + # is to work around this wonky behavior: + # >>> t.decode(13) + # '<0x0A>' + # >>> t.decode([13]) + # '\n' + # Not doing this causes token streaming to receive <0x0A> characters + # instead of newlines. + if isinstance(token_ids, int): + first = token_ids + token_ids = [first] + elif hasattr(token_ids, 'dim'): # Check for e.g. torch.Tensor + # Tensors don't support the Python standard of 'empty is False' + # and the special case of dimension 0 tensors also needs to be + # handled separately. + if token_ids.dim() == 0: + first = int(token_ids.item()) + token_ids = [first] + elif len(token_ids) > 0: + first = int(token_ids[0]) + elif token_ids is not None and len(token_ids) > 0: + first = token_ids[0] + result = original_decode(self, token_ids, *args, **kwargs) + if first is not None and first in has_prefix_space: + result = " " + result + return result + # GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it + object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer)) + + # Wrap encode and __call__ to work around the 'prefix space' misfeature also. + # The problem is that "Bob" at the start of text is encoded as if it is + # " Bob". This creates a problem because it means you can't split text, encode + # the pieces, concatenate the tokens, decode them, and get the original text back. + # The workaround is to prepend a known token that (1) starts with a space; and + # (2) is not the prefix of any other token. After searching through the vocab + # " ," (space comma) is the only token containing only printable ascii characters + # that fits this bill. By prepending ',' to the text, the original encode + # method always returns [1919, ...], where the tail of the sequence is the + # actual encoded result we want without the prefix space behavior. + original_encode = type(self.tokenizer.tokenizer).encode + def encode_wrapper(self, text, *args, **kwargs): + if type(text) is str: + text = ',' + text + result = original_encode(self, text, *args, **kwargs) + result = result[1:] + else: + result = original_encode(self, text, *args, **kwargs) + return result + object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer)) + + # Since 'encode' is documented as being deprecated, also override __call__. + # This doesn't appear to currently be used by KoboldAI, but doing so + # in case someone uses it in the future. 
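# [Editor's note - not part of the original patch: a hedged usage sketch of the
# decode/encode wrappers defined above. With both in place, piecewise encoding
# is expected to round-trip again:
#     ids = tokenizer.encode("Hello") + tokenizer.encode("world")
#     tokenizer.decode(ids)   # -> "Helloworld", not "Hello world"
# because encode() no longer invents a leading space for each fragment and
# decode() only re-adds a space when the first token actually carries one.]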
+ original_call = type(self.tokenizer.tokenizer).__call__ + def call_wrapper(self, text, *args, **kwargs): + if type(text) is str: + text = ',' + text + result = original_call(self, text, *args, **kwargs) + result = result[1:] + else: + result = original_call(self, text, *args, **kwargs) + return result + object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer)) + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + seed: Optional[int] = None, + **kwargs, + ) -> GenerationResult: + if not isinstance(prompt_tokens, torch.Tensor): + gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] + else: + gen_in = prompt_tokens + + self.generator.settings.temperature = max(gen_settings.temp, 0.01) + self.generator.settings.top_k = gen_settings.top_k if gen_settings.top_k > 0 else 10000 + self.generator.settings.top_p = gen_settings.top_p + self.generator.settings.min_p = 0.0 + + self.generator.gen_begin(gen_in) + + for i in range(max_new): + token = self.generator.gen_single_token() + if token.item() == self.tokenizer.eos_token_id: break + + return GenerationResult( + model=self, + out_batches=np.array( + self.generator.sequence[:, gen_in.size(1):], + ), + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) + + def _get_model(self, location: str, tf_kwargs: Dict): + _, self.model_config.model_path = load_model_gptq_settings(location) + return ExLlama(self.model_config) + + def _get_tokenizer(self, location: str): + tokenizer = GenericTokenizer(LlamaTokenizer.from_pretrained(location)) + tokenizer._koboldai_header = tokenizer.encode("") + return tokenizer + + def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}): + requested_parameters = [] + gpu_count = torch.cuda.device_count() + layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None + requested_parameters.append({ + "uitype": "Valid Display", + "unit": "text", + "label": "Current Allocated Layers: %1/{}".format(layer_count), #%1 will be the validation value + "id": "valid_layers", + "max": layer_count, + "step": 1, + "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="}, + "menu_path": "Layers", + "extra_classes": "", + "refresh_model_inputs": False + }) + for i in range(gpu_count): + requested_parameters.append({ + "uitype": "slider", + "unit": "int", + "label": "{} Layers".format(torch.cuda.get_device_name(i)), + "id": "{}_Layers".format(i), + "min": 0, + "max": layer_count, + "step": 1, + "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="}, + "check_message": "The sum of assigned layers must equal {}".format(layer_count), + "default": [layer_count if i == 0 else 0], + "tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)), + "menu_path": "Layers", + "extra_classes": "", + "refresh_model_inputs": False + }) + + return requested_parameters + + def set_input_parameters(self, parameters): + gpu_count = torch.cuda.device_count() + layers = [] + for i in range(gpu_count): + if isinstance(parameters["{}_Layers".format(i)], 
str) and parameters["{}_Layers".format(i)].isnumeric(): + layers.append(int(parameters["{}_Layers".format(i)])) + elif isinstance(parameters["{}_Layers".format(i)], str): + layers.append(None) + else: + layers.append(parameters["{}_Layers".format(i)]) + + self.layers = layers + for i, l in enumerate(layers): + if l > 0: + self.model_config.device_map.layers.extend([f"cuda:{i}"] * l) + self.model_config.device_map.lm_head = "cuda:0" + self.model_config.device_map.norm = "cuda:0" + + self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] + self.path = parameters['path'] if 'path' in parameters else None From 94520d5c80c571f0ae97d92c7641f743cf566f6b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 5 Jun 2023 18:43:57 +0200 Subject: [PATCH 089/113] Fix exllama model unload --- modeling/inference_models/exllama/class.py | 40 +++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 0160ed4b..db1728cf 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -9,6 +9,8 @@ import os import glob from pathlib import Path import re +import warnings +import gc import utils from logger import logger @@ -26,8 +28,6 @@ from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig from transformers import LlamaTokenizer from exllama.generator import ExLlamaGenerator -import traceback - model_backend_name = "ExLlama" @@ -60,8 +60,10 @@ class model_backend(InferenceModel): self.model = None self.tokenizer = None + self.cache = None + self.generator = None - self.model_name = None + self.model_name = "" self.path = None def is_valid(self, model_name, model_path, menu_path): @@ -84,7 +86,7 @@ class model_backend(InferenceModel): def _load(self, save_model: bool, initial_load: bool) -> None: self.model = self._get_model(self.get_local_model_path(), {}) - self.tokenizer = self._get_tokenizer(os.path.join(self.get_local_model_path(), "tokenizer.model")) + self.tokenizer = self._get_tokenizer(self.get_local_model_path())) self.cache = ExLlamaCache(self.model) @@ -174,6 +176,33 @@ class model_backend(InferenceModel): return result object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer)) + def unload(self): + self.model_config = None + + self.model = None + self.tokenizer = None + self.cache = None + self.generator = None + + self.model_name = "" + self.path = None + + with torch.no_grad(): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated") + for tensor in gc.get_objects(): + try: + if torch.is_tensor(tensor): + tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype)) + except: + pass + gc.collect() + try: + with torch.no_grad(): + torch.cuda.empty_cache() + except: + pass + def _raw_generate( self, prompt_tokens: Union[List[int], torch.Tensor], @@ -184,6 +213,9 @@ class model_backend(InferenceModel): seed: Optional[int] = None, **kwargs, ) -> GenerationResult: + if seed: + torch.manual_seed(seed) + if not isinstance(prompt_tokens, torch.Tensor): gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] else: From 39dfb1845570718d31490273bcb008718419b54e Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 6 Jun 2023 19:21:34 +0200 Subject: [PATCH 090/113] Replace exllama samplers with kobold's inbuilt ones --- modeling/inference_models/exllama/class.py | 56 +++++++++++++++++++++- 1 
file changed, 54 insertions(+), 2 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index db1728cf..3ff38d33 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -15,6 +15,10 @@ import gc import utils from logger import logger +from modeling import warpers +from modeling.warpers import Warper +from modeling.stoppers import Stoppers +from modeling.post_token_hooks import PostTokenHooks from modeling.inference_model import ( GenerationResult, GenerationSettings, @@ -30,6 +34,11 @@ from exllama.generator import ExLlamaGenerator model_backend_name = "ExLlama" +# When set to true, messages will appear in the console if samplers are not +# changing the scores. Keep in mind some samplers don't always change the +# scores for each token. +LOG_SAMPLER_NO_EFFECT = False + def load_model_gptq_settings(path): try: @@ -86,7 +95,7 @@ class model_backend(InferenceModel): def _load(self, save_model: bool, initial_load: bool) -> None: self.model = self._get_model(self.get_local_model_path(), {}) - self.tokenizer = self._get_tokenizer(self.get_local_model_path())) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) self.cache = ExLlamaCache(self.model) @@ -203,6 +212,34 @@ class model_backend(InferenceModel): except: pass + def _apply_warpers( + self, scores: torch.Tensor, input_ids: torch.Tensor + ) -> torch.Tensor: + warpers.update_settings() + + if LOG_SAMPLER_NO_EFFECT: + pre = torch.Tensor(scores) + + for sid in utils.koboldai_vars.sampler_order: + warper = Warper.from_id(sid) + + if not warper.value_is_valid(): + continue + + if warper == warpers.RepetitionPenalty: + # Rep pen needs more data than other samplers + scores = warper.torch(scores, input_ids=input_ids) + else: + scores = warper.torch(scores) + + assert scores is not None, f"Scores are None; warper '{warper}' is to blame" + + if LOG_SAMPLER_NO_EFFECT: + if torch.equal(pre, scores): + logger.info(warper, "had no effect on the scores.") + pre = torch.Tensor(scores) + return scores + def _raw_generate( self, prompt_tokens: Union[List[int], torch.Tensor], @@ -228,8 +265,23 @@ class model_backend(InferenceModel): self.generator.gen_begin(gen_in) + # from pudb.remote import set_trace + # set_trace(term_size=(200, 60)) + for i in range(max_new): - token = self.generator.gen_single_token() + logits = self.model.forward(self.generator.sequence[:, -1:], self.cache) + logits[:, :, self.tokenizer.bos_token_id] = -10000.0 + + logits = torch.unsqueeze(logits[0, -1, :], 0) + + scores = self._apply_warpers(logits, gen_in) + + scores = torch.softmax(scores, dim=-1) + + token = torch.multinomial(scores, 1) + + self.generator.gen_accept_token(token) + if token.item() == self.tokenizer.eos_token_id: break return GenerationResult( From 47b371b9d3a21c341e1386c523ec87c760393ff7 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 6 Jun 2023 19:51:38 +0200 Subject: [PATCH 091/113] Fix multigpu --- modeling/inference_models/exllama/class.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 3ff38d33..b17d04bf 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -295,7 +295,11 @@ class model_backend(InferenceModel): ) def _get_model(self, location: str, tf_kwargs: Dict): + if not self.model_config: + ExLlamaConfig(os.path.join(location, "config.json")) + _, self.model_config.model_path = 
load_model_gptq_settings(location) + # self.model_config.gpu_peer_fix = True return ExLlama(self.model_config) def _get_tokenizer(self, location: str): @@ -351,6 +355,7 @@ class model_backend(InferenceModel): layers.append(parameters["{}_Layers".format(i)]) self.layers = layers + self.model_config.device_map.layers = [] for i, l in enumerate(layers): if l > 0: self.model_config.device_map.layers.extend([f"cuda:{i}"] * l) From 12df8220fb2d6122ee828c0910943a8e08c7ebb4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 23 May 2023 06:59:28 +0200 Subject: [PATCH 092/113] Add gpt_bigcode support, fix 8-bit GPTQ incoherence --- docs/gptq-whl-links.html | 3 +++ environments/huggingface.yml | 2 +- modeling/inference_models/gptq_hf_torch/class.py | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 34d05691..0808dbc6 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -11,3 +11,6 @@ gptq_koboldai-0.0.4-cp38-cp38-linux_x86_64.whl gptq_koboldai-0.0.4-cp38-cp38-win_amd64.whl + +gptq_koboldai-0.0.5-cp38-cp38-linux_x86_64.whl +gptq_koboldai-0.0.5-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index acba0648..79258b60 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -47,6 +47,6 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai==0.0.4 + - gptq_koboldai==0.0.5 - einops - peft==0.3.0 diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 7d7dfc00..0cc1da8d 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -35,6 +35,7 @@ from gptq.gptj import load_quant as gptj_load_quant from gptq.gptneox import load_quant as gptneox_load_quant from gptq.llama import load_quant as llama_load_quant from gptq.opt import load_quant as opt_load_quant +from gptq.bigcode import load_quant as bigcode_load_quant from gptq.mpt import load_quant as mpt_load_quant from gptq.offload import load_quant_offload @@ -220,6 +221,8 @@ class model_backend(HFTorchInferenceModel): model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) elif model_type == "mpt": model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + elif model_type == "gpt_bigcode": + model = load_quant_offload(bigcode_load_quant, location, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list).half() elif autogptq_support: # Monkey patch in hf_bleeding_edge to avoid having to trust remote code auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig From 0001ae00ab76e94a1743cbd8cdacc5f2483afce0 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 12 Jun 2023 07:18:22 +0200 Subject: [PATCH 093/113] Add v2 with bias support (e.g. 
for Tulu-30b) --- .../inference_models/gptq_hf_torch/class.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 0cc1da8d..d07aef23 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -96,17 +96,17 @@ def get_gptq_version(fpath): v2 = all([s in data for s in v3_strings]) if v2: - if v0 or v1: - logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}") - return 2 + if v0: + logger.warning(f"GPTQ model identified as v2, but v0={v0}") + return 2, v1 if v1: if v0 or v2: logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}") - return 1 + return 1, False if v0: if v1 or v2: logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}") - return 0 + return 0, False class model_backend(HFTorchInferenceModel): @@ -203,26 +203,27 @@ class model_backend(HFTorchInferenceModel): def _get_model(self, location: str, tf_kwargs: Dict): gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location) + v2_bias = False if gptq_version < 0: - gptq_version = get_gptq_version(gptq_file) + gptq_version, v2_bias = get_gptq_version(gptq_file) gptq.modelutils.set_gptq_version(gptq_version) model_type = self.get_model_type() - logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}, groupsize {gptq_groupsize}") + logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}") if model_type == "gptj": - model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "gpt_neox": - model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "llama": - model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "opt": - model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "mpt": - model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list) + model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "gpt_bigcode": - model = load_quant_offload(bigcode_load_quant, location, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list).half() + model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half() elif autogptq_support: # Monkey patch in hf_bleeding_edge to avoid having to trust remote code 
auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig From ebf7e2cf57efcab1a4998fc85029566700ce9497 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 12 Jun 2023 08:27:30 +0200 Subject: [PATCH 094/113] Update GPTQ module to 0.0.6 --- docs/gptq-whl-links.html | 3 +++ environments/huggingface.yml | 2 +- environments/rocm.yml | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html index 0808dbc6..b993d9bd 100644 --- a/docs/gptq-whl-links.html +++ b/docs/gptq-whl-links.html @@ -14,3 +14,6 @@ gptq_koboldai-0.0.5-cp38-cp38-linux_x86_64.whl gptq_koboldai-0.0.5-cp38-cp38-win_amd64.whl + +gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl +gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 79258b60..2c996ff9 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -47,6 +47,6 @@ dependencies: - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html - - gptq_koboldai==0.0.5 + - gptq_koboldai==0.0.6 - einops - peft==0.3.0 diff --git a/environments/rocm.yml b/environments/rocm.yml index 7ef282cc..b85cfd74 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -42,5 +42,6 @@ dependencies: - pydub - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ + - git+https://github.com/0cc4m/GPTQ-for-LLaMa@0.0.6 - einops - peft==0.3.0 From 0c7eaefb1acc522eeed0b2dc1af78ec894b84a8b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 13 Jun 2023 10:11:29 +0200 Subject: [PATCH 095/113] Fix AMD ROCm exllama inference --- modeling/inference_models/exllama/class.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index b17d04bf..37681b4f 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -362,5 +362,10 @@ class model_backend(InferenceModel): self.model_config.device_map.lm_head = "cuda:0" self.model_config.device_map.norm = "cuda:0" + self.model_config.rmsnorm_no_half2 = bool(torch.version.hip) + self.model_config.rope_no_half2 = bool(torch.version.hip) + self.model_config.matmul_no_half2 = bool(torch.version.hip) + self.model_config.silu_no_half2 = bool(torch.version.hip) + self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] self.path = parameters['path'] if 'path' in parameters else None From e874f0c1c26501a0c2592b3acde8a3a271a7c50d Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 19 Jun 2023 19:05:31 +0200 Subject: [PATCH 096/113] Add token streaming support for exllama --- modeling/inference_models/exllama/class.py | 26 ++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 37681b4f..614a3de1 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -75,6 +75,25 @@ class model_backend(InferenceModel): self.model_name = "" self.path = None + self.post_token_hooks = [ + PostTokenHooks.stream_tokens, + ] + + self.stopper_hooks = [ + Stoppers.core_stopper, + Stoppers.dynamic_wi_scanner, + Stoppers.singleline_stopper, + Stoppers.chat_mode_stopper, + Stoppers.stop_sequence_stopper, + ] + + self.capabilties = ModelCapabilities( + embedding_manipulation=False, + post_token_hooks=True, + 
stopper_hooks=False, + post_token_probs=False, + ) + def is_valid(self, model_name, model_path, menu_path): gptq_model, _ = load_model_gptq_settings(model_path) try: @@ -265,11 +284,8 @@ class model_backend(InferenceModel): self.generator.gen_begin(gen_in) - # from pudb.remote import set_trace - # set_trace(term_size=(200, 60)) - for i in range(max_new): - logits = self.model.forward(self.generator.sequence[:, -1:], self.cache) + logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache) logits[:, :, self.tokenizer.bos_token_id] = -10000.0 logits = torch.unsqueeze(logits[0, -1, :], 0) @@ -282,6 +298,8 @@ class model_backend(InferenceModel): self.generator.gen_accept_token(token) + self._post_token_gen(self.generator.sequence) + if token.item() == self.tokenizer.eos_token_id: break return GenerationResult( From a191855b37407f91f03576814a1cb4b548100183 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 19 Jun 2023 19:14:04 +0200 Subject: [PATCH 097/113] Track token generation progress --- modeling/inference_models/exllama/class.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 614a3de1..811f8da1 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -300,8 +300,12 @@ class model_backend(InferenceModel): self._post_token_gen(self.generator.sequence) + utils.koboldai_vars.generated_tkns += 1 + if token.item() == self.tokenizer.eos_token_id: break + utils.koboldai_vars.generated_tkns = max_new + return GenerationResult( model=self, out_batches=np.array( From e8741a1b5709f98187fb6ecd3d3d35fa0b9cd57c Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 20 Jun 2023 09:19:43 +0200 Subject: [PATCH 098/113] Disable scaled_dot_product_attention if torch version < 2 --- modeling/inference_models/exllama/class.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 811f8da1..995f5874 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -282,7 +282,7 @@ class model_backend(InferenceModel): self.generator.settings.top_p = gen_settings.top_p self.generator.settings.min_p = 0.0 - self.generator.gen_begin(gen_in) + self.generator.gen_begin_reuse(gen_in) for i in range(max_new): logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache) @@ -384,10 +384,15 @@ class model_backend(InferenceModel): self.model_config.device_map.lm_head = "cuda:0" self.model_config.device_map.norm = "cuda:0" + # Disable half2 for HIP self.model_config.rmsnorm_no_half2 = bool(torch.version.hip) self.model_config.rope_no_half2 = bool(torch.version.hip) self.model_config.matmul_no_half2 = bool(torch.version.hip) self.model_config.silu_no_half2 = bool(torch.version.hip) + # Disable scaled_dot_product_attention if torch version < 2 + if torch.__version__.startswith("1."): + self.model_config.sdp_thd = 0 + self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] self.path = parameters['path'] if 'path' in parameters else None From adad81639dbe9867039c25874c73d801fb48df86 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 21 Jun 2023 15:47:46 +0200 Subject: [PATCH 099/113] Remove rocm gptq install from environments file --- environments/rocm.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environments/rocm.yml 
b/environments/rocm.yml index b85cfd74..7ef282cc 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -42,6 +42,5 @@ dependencies: - pydub - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - - git+https://github.com/0cc4m/GPTQ-for-LLaMa@0.0.6 - einops - peft==0.3.0 From c753671ac14850a2528c0e1028816a12ca8005ac Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 27 Jun 2023 07:39:37 +0200 Subject: [PATCH 100/113] Add exllama superhot positional embeddings compression support --- modeling/inference_models/exllama/class.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 995f5874..19478cc8 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -106,11 +106,18 @@ class model_backend(InferenceModel): return self.path or os.path.join("models", self.model_name.replace("/", "_")) def _load_config(self, model_name, model_path): + config = False if model_path is not None and os.path.exists(model_path): - return ExLlamaConfig(os.path.join(model_path, "config.json")) - if(os.path.exists("models/{}".format(model_name.replace('/', '_')))): - return ExLlamaConfig(os.path.join("models/{}".format(model_name.replace('/', '_')), "config.json")) - return False + config = ExLlamaConfig(os.path.join(model_path, "config.json")) + if not config and os.path.exists("models/{}".format(model_name.replace('/', '_'))): + config = ExLlamaConfig(os.path.join("models/{}".format(model_name.replace('/', '_')), "config.json")) + + if config and "superhot" in model_name.lower(): + # Set compress_pos_emb factor + config.max_seq_len = 8192 + config.compress_pos_emb = 4.0 + + return config def _load(self, save_model: bool, initial_load: bool) -> None: self.model = self._get_model(self.get_local_model_path(), {}) @@ -277,11 +284,6 @@ class model_backend(InferenceModel): else: gen_in = prompt_tokens - self.generator.settings.temperature = max(gen_settings.temp, 0.01) - self.generator.settings.top_k = gen_settings.top_k if gen_settings.top_k > 0 else 10000 - self.generator.settings.top_p = gen_settings.top_p - self.generator.settings.min_p = 0.0 - self.generator.gen_begin_reuse(gen_in) for i in range(max_new): From 0e4b6571d5f5fb1104fdea7194f2f2913ef243ec Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 28 Jun 2023 22:50:04 +0200 Subject: [PATCH 101/113] Fix non-tuple return from gptq function --- modeling/inference_models/exllama/class.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 19478cc8..1caa2afd 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -44,7 +44,7 @@ def load_model_gptq_settings(path): try: js = json.load(open(path + "/config.json", "r")) except Exception as e: - return False + return False, False gptq_model = False gptq_file = False From ed7ad00b593f431af59a3cd7315dc36ca2940c6f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 15 Jul 2023 22:55:17 +0200 Subject: [PATCH 102/113] Move GPTQ readme changes to separate file --- README.md | 52 -------------------------------------------------- README_GPTQ.md | 50 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 52 deletions(-) create mode 100644 README_GPTQ.md diff --git a/README.md b/README.md index 5f4bc5c7..789b78d1 100644 --- a/README.md +++ b/README.md @@ -1,55 
+1,3 @@ -## This is a fork of KoboldAI that implements 4bit GPTQ quantized support to include Llama. - -### Install/Use Guide -(This guide is for both Linux and Windows and assumes user has git installed and a basic grasp of command line use) - -#### Installation -In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. - -Note: do not run your command prompt as administrator/with elevated priviledges, reports suggest this leads to problems. - -`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules` - -`cd KoboldAI` - -Next step, (Windows) subfolder mode or B: option doesn't matter choose either - -* [if on Windows] - ``` - install_requirements.bat - ``` - * if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. - -* [if on Linux with Nvidia] - ``` - ./install_requirements.sh - ``` -* [if on Linux with AMD] - ``` - ./install_requirements.sh rocm - ./commandline-rocm.sh - pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa@c884b421a233f9603d8224c9b22c2d83dd2c1fc4 - ``` - * If you get error missing hip/hip_runtime_xxx.h you dont have proper rocm & hip pkg installed - * If you get CUDA_HOME envar is not set run in env: - `pip3 install torch --index-url https://download.pytorch.org/whl/rocm5.4.2 --force-reinstall` - -#### Setting up models -If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) - -Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). - -Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) - -So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, it'll load the quantized model. - -#### Running KoboldAI and loading 4bit models -If you haven't done so already, exit the command prompt/leave KAI's conda env. (Close the commandline window on Windows, run `exit` on Linux) - -Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD] - -Switch to UI2, then load your model. - ## KoboldAI - Your gateway to GPT writing This is a browser-based front-end for AI-assisted writing with multiple local & remote AI models. It offers the standard array of tools, including Memory, Author's Note, World Info, Save & Load, adjustable AI settings, formatting options, and the ability to import existing AI Dungeon adventures. You can also turn on Adventure mode and play the game like AI Dungeon Unleashed. diff --git a/README_GPTQ.md b/README_GPTQ.md new file mode 100644 index 00000000..e1961cb8 --- /dev/null +++ b/README_GPTQ.md @@ -0,0 +1,50 @@ +### Install/Use Guide +(This guide is for both Linux and Windows and assumes user has git installed and a basic grasp of command line use) + +#### Installation +In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. 
+ +Note: do not run your command prompt as administrator/with elevated priviledges, reports suggest this leads to problems. + +`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules` + +`cd KoboldAI` + +Next step, (Windows) subfolder mode or B: option doesn't matter choose either + +* [if on Windows] + ``` + install_requirements.bat + ``` + * if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. + +* [if on Linux with Nvidia] + ``` + ./install_requirements.sh + ``` +* [if on Linux with AMD] + ``` + ./install_requirements.sh rocm + ./commandline-rocm.sh + pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa@c884b421a233f9603d8224c9b22c2d83dd2c1fc4 + ``` + * If you get error missing hip/hip_runtime_xxx.h you dont have proper rocm & hip pkg installed + * If you get CUDA_HOME envar is not set run in env: + `pip3 install torch --index-url https://download.pytorch.org/whl/rocm5.4.2 --force-reinstall` + +#### Setting up models +If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) + +Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). + +Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) + +So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, it'll load the quantized model. + +#### Running KoboldAI and loading 4bit models +If you haven't done so already, exit the command prompt/leave KAI's conda env. (Close the commandline window on Windows, run `exit` on Linux) + +Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD] + +Switch to UI2, then load your model. 
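[Editor's note: the file-naming rules above map directly onto how the GPTQ backends locate the checkpoint and infer the groupsize. The sketch below is a simplified, hedged reconstruction based on the `prepare_4bit_load` / `load_model_gptq_settings` helpers in this patch series; the `find_quant_file` name and the example path are illustrative and not part of KoboldAI.]

```
import glob, os, re
from pathlib import Path

def find_quant_file(model_dir):
    # Current naming: quantization metadata lives in config.json, so the
    # weights keep the standard Huggingface file names.
    for name in ("model.safetensors", "model.ckpt"):
        path = os.path.join(model_dir, name)
        if os.path.isfile(path):
            return path, -1  # groupsize is read from config.json instead

    # Legacy naming from this guide: 4bit.safetensors, 4bit-128g.pt, ...
    for pattern in ("4bit*.safetensors", "4bit*.pt"):
        for candidate in glob.glob(os.path.join(model_dir, pattern)):
            fname = Path(candidate).parts[-1]
            groups = re.findall(r"^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
            return candidate, int(groups[0]) if groups else -1

    return None, -1

# e.g. find_quant_file("models/my-llama-4bit") might return
# ("models/my-llama-4bit/4bit-128g.safetensors", 128)
```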
+ From 9aa6c5fbbfcb9a2f22f38fc9baa07e5baa033361 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 19 Jul 2023 06:56:09 +0200 Subject: [PATCH 103/113] Merge upstream changes, fix conflict, adapt backends to changes --- modeling/inference_models/exllama/class.py | 1 + .../inference_models/gptq_hf_torch/class.py | 50 ++++++------------- 2 files changed, 16 insertions(+), 35 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 1caa2afd..21eba58e 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -32,6 +32,7 @@ from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig from transformers import LlamaTokenizer from exllama.generator import ExLlamaGenerator +model_backend_type = "GPTQ" model_backend_name = "ExLlama" # When set to true, messages will appear in the console if samplers are not diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index d07aef23..16d3db91 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -18,13 +18,6 @@ import modeling.lazy_loader as lazy_loader import koboldai_settings from logger import logger, set_logger_verbosity -try: - import breakmodel -except ModuleNotFoundError as e: - # Breakmodel is only expected to work on GPU - if not utils.koboldai_vars.use_colab_tpu: - raise e - from modeling.inference_models.hf_torch import HFTorchInferenceModel from modeling.tokenizer import GenericTokenizer @@ -47,6 +40,7 @@ except ImportError: autogptq_support = False +model_backend_type = "GPTQ" model_backend_name = "Huggingface GPTQ" @@ -112,7 +106,7 @@ def get_gptq_version(fpath): class model_backend(HFTorchInferenceModel): def is_valid(self, model_name, model_path, menu_path): gptq_model, _, _, _, _ = load_model_gptq_settings(model_path) - return gptq_model + return bool(gptq_model) def _load(self, save_model: bool, initial_load: bool) -> None: # Make model path the same as the model name to make this consistent @@ -126,7 +120,7 @@ class model_backend(HFTorchInferenceModel): self.lazy_load = False - gpulayers = breakmodel.gpu_blocks + gpulayers = self.breakmodel_config.gpu_blocks try: self.gpu_layers_list = [int(l) for l in gpulayers.split(",")] @@ -149,42 +143,28 @@ class model_backend(HFTorchInferenceModel): self.breakmodel_device_config(self.model_config) if self.lazy_load: + # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time + tf_kwargs.pop("low_cpu_mem_usage", None) + # If we're using lazy loader, we need to figure out what the model's hidden layers are called - with lazy_loader.use_lazy_load( - dematerialized_modules=True, use_accelerate_init_empty_weights=True - ): + with lazy_loader.use_lazy_load(dematerialized_modules=True): try: metamodel = AutoModelForCausalLM.from_config(self.model_config) utils.layers_module_names = utils.get_layers_module_names(metamodel) utils.module_names = list(metamodel.state_dict().keys()) utils.named_buffers = list(metamodel.named_buffers(recurse=True)) except Exception as e: + if utils.args.panic: + raise e logger.warning(f"Gave up on lazy loading due to {e}") self.lazy_load = False - # Download model from Huggingface if it does not exist, otherwise load locally - with self._maybe_use_float16(), lazy_loader.use_lazy_load( - enable=self.lazy_load, - callback=self._get_lazy_load_callback(utils.num_layers(self.model_config)) - if self.lazy_load - else None, - 
dematerialized_modules=True, - ): - if self.lazy_load: - # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - tf_kwargs.pop("low_cpu_mem_usage", None) - - if self.get_local_model_path(): - # Model is stored locally, load it. - self.model = self._get_model(self.get_local_model_path(), tf_kwargs) - self.tokenizer = self._get_tokenizer(self.get_local_model_path()) - else: - raise NotImplementedError("GPTQ Model downloading not implemented") - - if not self.lazy_load: - utils.layers_module_names = utils.get_layers_module_names(self.model) - utils.module_names = list(self.model.state_dict().keys()) - utils.named_buffers = list(self.model.named_buffers(recurse=True)) + if self.get_local_model_path(): + # Model is stored locally, load it. + self.model = self._get_model(self.get_local_model_path(), tf_kwargs) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) + else: + raise NotImplementedError("GPTQ Model downloading not implemented") if ( utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default From c84d063be880672a30ecd1a8a48791b6ab12685e Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 19 Jul 2023 07:01:11 +0200 Subject: [PATCH 104/113] Revert settings changes --- koboldai_settings.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/koboldai_settings.py b/koboldai_settings.py index 8ab134fa..ebd8c019 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -919,7 +919,7 @@ class story_settings(settings): # In percent!!! self.commentary_chance = 0 self.commentary_enabled = False - + self.save_paths = SavePaths(os.path.join("stories", self.story_name or "Untitled")) ################### must be at bottom ######################### @@ -1206,12 +1206,12 @@ class system_settings(settings): local_only_variables = ['lua_state', 'lua_logname', 'lua_koboldbridge', 'lua_kobold', 'lua_koboldcore', 'regex_sl', 'acregex_ai', 'acregex_ui', 'comregex_ai', 'comregex_ui', 'sp', '_horde_pid', 'inference_config', 'image_pipeline', - 'summarizer', 'summary_tokenizer', 'tts_model', 'rng_states', 'comregex_ai', 'comregex_ui'] + 'summarizer', 'summary_tokenizer', 'tts_model', 'rng_states', 'comregex_ai', 'comregex_ui', 'colab_arg'] no_save_variables = ['lua_state', 'lua_logname', 'lua_koboldbridge', 'lua_kobold', 'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy', 'serverstarted', 'inference_config', 'image_pipeline', 'summarizer', 'on_colab' 'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model', - 'generating_image', 'bit_8_available', 'bit_4_available', 'host', 'hascuda', 'usegpu', 'rng_states', 'comregex_ai', 'comregex_ui', 'git_repository', 'git_branch'] + 'generating_image', 'bit_8_available', 'host', 'hascuda', 'usegpu', 'rng_states', 'comregex_ai', 'comregex_ui', 'git_repository', 'git_branch', 'colab_arg'] settings_name = "system" def __init__(self, socketio, koboldai_var): self._socketio = socketio @@ -1279,11 +1279,12 @@ class system_settings(settings): self.disable_output_formatting = False self.api_tokenizer_id = None self.port = 5000 + self.colab_arg = False try: import google.colab self.on_colab = True except: - self.on_colab = False + self.on_colab = self.colab_arg print(f"Colab Check: {self.on_colab}, TPU: {self.use_colab_tpu}") self.horde_share = False self._horde_pid = None @@ -1294,13 +1295,6 @@ class system_settings(settings): self.keep_img_gen_in_memory = False self.cookies = {} #cookies for colab since colab's URL changes, cookies 
are lost self.experimental_features = False - # Check if repos/gptq exists for 4-bit mode - self.bit_4_available = True - try: - import gptq - except ImportError: - self.bit_4_available = False - self.seen_messages = [] self.git_repository = "" self.git_branch = "" From 1c5da2bbf3b1dbe71599449e7953df3fc06ab301 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 19 Jul 2023 07:08:39 +0200 Subject: [PATCH 105/113] Move pip docs from KoboldAI into GPTQ repo --- docs/gptq-whl-links.html | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 docs/gptq-whl-links.html diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html deleted file mode 100644 index b993d9bd..00000000 --- a/docs/gptq-whl-links.html +++ /dev/null @@ -1,19 +0,0 @@ -gptq_koboldai-0.0.1-cp38-cp38-linux_x86_64.whl -gptq_koboldai_rocm-0.0.1-cp38-cp38-linux_x86_64.whl -gptq_koboldai-0.0.1-cp38-cp38-win_amd64.whl - -gptq_koboldai-0.0.2-cp38-cp38-linux_x86_64.whl -gptq_koboldai_rocm-0.0.2-cp38-cp38-linux_x86_64.whl -gptq_koboldai-0.0.2-cp38-cp38-win_amd64.whl - -gptq_koboldai-0.0.3-cp38-cp38-linux_x86_64.whl -gptq_koboldai-0.0.3-cp38-cp38-win_amd64.whl - -gptq_koboldai-0.0.4-cp38-cp38-linux_x86_64.whl -gptq_koboldai-0.0.4-cp38-cp38-win_amd64.whl - -gptq_koboldai-0.0.5-cp38-cp38-linux_x86_64.whl -gptq_koboldai-0.0.5-cp38-cp38-win_amd64.whl - -gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl -gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl From 19f511dc9f11c2f68f1c697f8d6b4f0521335c54 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 19 Jul 2023 07:12:37 +0200 Subject: [PATCH 106/113] Load GPTQ module from GPTQ repo docs --- environments/huggingface.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index c4cccf98..2cbefe7f 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -47,7 +47,7 @@ dependencies: - pydub - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ - - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html + - --find-links=https://0cc4m.github.io/GPTQ-for-LLaMa/gptq-whl-links.html - gptq_koboldai==0.0.6 - einops - peft==0.3.0 From 58908ab846f44671533a66fb866bedbc45a60198 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 19 Jul 2023 07:14:03 +0200 Subject: [PATCH 107/113] Revert aiserver.py changes --- aiserver.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/aiserver.py b/aiserver.py index 88a76454..0aa9bd4c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -50,8 +50,6 @@ import multiprocessing import numpy as np from collections import OrderedDict from typing import Any, Callable, TypeVar, Tuple, Union, Dict, Set, List, Optional, Type -import glob -from pathlib import Path import requests import html @@ -1087,6 +1085,8 @@ def getmodelname(): def get_hidden_size_from_model(model): return model.get_input_embeddings().embedding_dim + + #==================================================================# # Allow the models to override some settings #==================================================================# @@ -1162,7 +1162,6 @@ def loadmodelsettings(): if(not koboldai_vars.gamestarted): koboldai_vars.authornotetemplate = koboldai_vars.setauthornotetemplate - #==================================================================# # Take settings from koboldai_vars and write them to client settings file #==================================================================# @@ -1594,7 +1593,8 @@ def general_startup(override_args=None): if 
koboldai_vars.use_colab_tpu and args.model_backend == "Huggingface": args.model_backend = "Huggingface MTJ" - + + if args.model: # At this point we have to try to load the model through the selected backend if args.model_backend not in model_backends: @@ -1761,7 +1761,8 @@ def load_model(model_backend, initial_load=False): if 'model' in globals(): model.unload() - + + # If transformers model was selected & GPU available, ask to use CPU or GPU if(not koboldai_vars.use_colab_tpu and koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): # loadmodelsettings() @@ -1783,6 +1784,8 @@ def load_model(model_backend, initial_load=False): else: koboldai_vars.default_preset = koboldai_settings.default_preset + + with use_custom_unpickler(RestrictedUnpickler): model = model_backends[model_backend] model.load(initial_load=initial_load, save_model=not (args.colab or args.cacheonly) or args.savemodel) @@ -1791,7 +1794,7 @@ def load_model(model_backend, initial_load=False): koboldai_vars.model = os.path.basename(os.path.normpath(model.path)) logger.info(koboldai_vars.model) logger.debug("Model Type: {}".format(koboldai_vars.model_type)) - + # TODO: Convert everywhere to use model.tokenizer if model: tokenizer = model.tokenizer From 748e5ef318095d2d6f47ed6da3272699c96088af Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 23 Jul 2023 07:11:28 +0200 Subject: [PATCH 108/113] Add sliders for exllama context size and related methods --- modeling/inference_models/exllama/class.py | 58 ++++++++++++++++--- .../inference_models/gptq_hf_torch/class.py | 4 +- 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 21eba58e..aa37a7aa 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -50,7 +50,7 @@ def load_model_gptq_settings(path): gptq_model = False gptq_file = False - gptq_legacy_files = glob.glob(os.path.join(path, "4bit*.safetensors")) + gptq_legacy_files = glob.glob(os.path.join(path, "*4bit*.safetensors")) if "gptq_bits" in js: gptq_model = True gptq_file = os.path.join(path, "model.safetensors") @@ -58,7 +58,7 @@ def load_model_gptq_settings(path): gptq_model = True gptq_file = gptq_legacy_files[0] fname = Path(gptq_file).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + g = re.findall("(?:4bit)(?:-)(\\d+)(?:g-?)", fname) return gptq_model, gptq_file @@ -113,11 +113,6 @@ class model_backend(InferenceModel): if not config and os.path.exists("models/{}".format(model_name.replace('/', '_'))): config = ExLlamaConfig(os.path.join("models/{}".format(model_name.replace('/', '_')), "config.json")) - if config and "superhot" in model_name.lower(): - # Set compress_pos_emb factor - config.max_seq_len = 8192 - config.compress_pos_emb = 4.0 - return config def _load(self, save_model: bool, initial_load: bool) -> None: @@ -366,6 +361,51 @@ class model_backend(InferenceModel): "refresh_model_inputs": False }) + requested_parameters.append({ + "uitype": "slider", + "unit": "int", + "label": "Maximum Context", + "id": "max_ctx", + "min": 2048, + "max": 16384, + "step": 512, + "default": 2048, + "tooltip": "The maximum context size the model supports", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + requested_parameters.append({ + "uitype": "slider", + "unit": "float", + "label": "Embedding Compression", + 
"id": "compress_emb", + "min": 1, + "max": 8, + "step": 0.25, + "default": 1, + "tooltip": "If the model requires compressed embeddings, set them here", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + requested_parameters.append({ + "uitype": "slider", + "unit": "float", + "label": "NTK alpha", + "id": "ntk_alpha", + "min": 1, + "max": 32, + "step": 0.25, + "default": 1, + "tooltip": "NTK alpha value", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + return requested_parameters def set_input_parameters(self, parameters): @@ -387,6 +427,10 @@ class model_backend(InferenceModel): self.model_config.device_map.lm_head = "cuda:0" self.model_config.device_map.norm = "cuda:0" + self.model_config.max_seq_len = parameters["max_ctx"] + self.model_config.compress_pos_emb = parameters["compress_emb"] + self.model_config.alpha_value = parameters["ntk_alpha"] + # Disable half2 for HIP self.model_config.rmsnorm_no_half2 = bool(torch.version.hip) self.model_config.rope_no_half2 = bool(torch.version.hip) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 16d3db91..157ebdbe 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -56,7 +56,7 @@ def load_model_gptq_settings(path): gptq_file = False gptq_version = -1 - gptq_legacy_files = glob.glob(os.path.join(path, "4bit*.pt")) + glob.glob(os.path.join(path, "4bit*.safetensors")) + gptq_legacy_files = glob.glob(os.path.join(path, "*4bit*.pt")) + glob.glob(os.path.join(path, "*4bit*.safetensors")) if "gptq_bits" in js: gptq_model = True gptq_bits = js["gptq_bits"] @@ -70,7 +70,7 @@ def load_model_gptq_settings(path): gptq_bits = 4 gptq_file = gptq_legacy_files[0] fname = Path(gptq_file).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) + g = re.findall("(?:4bit)(?:-)(\\d+)(?:g-?)", fname) gptq_groupsize = int(g[0]) if g else -1 gptq_version = -1 From 09bb1021ddc548e4422d6426fe2c1867b6d152b8 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 23 Jul 2023 07:14:23 +0200 Subject: [PATCH 109/113] Fallback to transformers if hf_bleeding_edge not available --- modeling/inference_models/generic_hf_torch/class.py | 5 ++++- modeling/inference_models/gptq_hf_torch/class.py | 7 +++++-- modeling/inference_models/hf.py | 5 ++++- modeling/inference_models/hf_torch.py | 5 ++++- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index de89034b..5471ae43 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -7,7 +7,10 @@ import shutil from typing import Union from transformers import GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig -from hf_bleeding_edge import AutoModelForCausalLM +try: + from hf_bleeding_edge import AutoModelForCausalLM +except ImportError: + from transformers import AutoModelForCausalLM from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 157ebdbe..0819c8ae 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ 
b/modeling/inference_models/gptq_hf_torch/class.py @@ -10,8 +10,11 @@ import sys from typing import Union from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer -import hf_bleeding_edge -from hf_bleeding_edge import AutoModelForCausalLM +try: + import hf_bleeding_edge + from hf_bleeding_edge import AutoModelForCausalLM +except ImportError: + from transformers import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index cd55c3ef..be0fb059 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -1,6 +1,9 @@ import os, sys from typing import Optional -from hf_bleeding_edge import AutoConfig +try: + from hf_bleeding_edge import AutoConfig +except ImportError: + from transformers import AutoConfig import warnings import utils diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index f7bd7a0b..6372858f 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -19,7 +19,10 @@ from transformers import ( GPT2LMHeadModel, LogitsProcessorList, ) -from hf_bleeding_edge import AutoModelForCausalLM +try: + from hf_bleeding_edge import AutoModelForCausalLM +except ImportError: + from transformers import AutoModelForCausalLM import utils import modeling.lazy_loader as lazy_loader From 31a984aa3d3c37e44b6114d3d5196167940181ee Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 23 Jul 2023 07:33:51 +0200 Subject: [PATCH 110/113] Automatically install exllama module --- environments/huggingface.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 82ea8f9b..e97f3e2e 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -52,3 +52,5 @@ dependencies: - einops - peft==0.3.0 - scipy + - --find-links=https://0cc4m.github.io/exllama/exllama-whl-links.html + - exllama==0.0.6 From 49740aa5abf406f7b9f6a60e60e23815c3f7007f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 23 Jul 2023 21:56:48 +0200 Subject: [PATCH 111/113] Fix ntk alpha --- modeling/inference_models/exllama/class.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index aa37a7aa..e3c7a874 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -430,6 +430,7 @@ class model_backend(InferenceModel): self.model_config.max_seq_len = parameters["max_ctx"] self.model_config.compress_pos_emb = parameters["compress_emb"] self.model_config.alpha_value = parameters["ntk_alpha"] + self.model_config.calculate_rotary_embedding_base() # Disable half2 for HIP self.model_config.rmsnorm_no_half2 = bool(torch.version.hip) From 973aea12ea079e9c5de1e418b848a0407da7eab7 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 23 Jul 2023 22:07:34 +0200 Subject: [PATCH 112/113] Only import big python modules for GPTQ once they get used --- .../inference_models/gptq_hf_torch/class.py | 50 +++++++++---------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 0819c8ae..81a33c70 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -9,13 +9,6 @@ import shutil import sys from typing import Union -from transformers import GPTNeoForCausalLM, 
AutoTokenizer, LlamaTokenizer -try: - import hf_bleeding_edge - from hf_bleeding_edge import AutoModelForCausalLM -except ImportError: - from transformers import AutoModelForCausalLM - import utils import modeling.lazy_loader as lazy_loader import koboldai_settings @@ -24,23 +17,7 @@ from logger import logger, set_logger_verbosity from modeling.inference_models.hf_torch import HFTorchInferenceModel from modeling.tokenizer import GenericTokenizer -# 4-bit dependencies -import gptq from pathlib import Path -from gptq.gptj import load_quant as gptj_load_quant -from gptq.gptneox import load_quant as gptneox_load_quant -from gptq.llama import load_quant as llama_load_quant -from gptq.opt import load_quant as opt_load_quant -from gptq.bigcode import load_quant as bigcode_load_quant -from gptq.mpt import load_quant as mpt_load_quant -from gptq.offload import load_quant_offload - -autogptq_support = True -try: - import auto_gptq - from auto_gptq import AutoGPTQForCausalLM -except ImportError: - autogptq_support = False model_backend_type = "GPTQ" @@ -185,6 +162,15 @@ class model_backend(HFTorchInferenceModel): utils.koboldai_vars.modeldim = self.get_hidden_size() def _get_model(self, location: str, tf_kwargs: Dict): + import gptq + from gptq.gptj import load_quant as gptj_load_quant + from gptq.gptneox import load_quant as gptneox_load_quant + from gptq.llama import load_quant as llama_load_quant + from gptq.opt import load_quant as opt_load_quant + from gptq.bigcode import load_quant as bigcode_load_quant + from gptq.mpt import load_quant as mpt_load_quant + from gptq.offload import load_quant_offload + gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location) v2_bias = False @@ -207,7 +193,19 @@ class model_backend(HFTorchInferenceModel): model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "gpt_bigcode": model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half() - elif autogptq_support: + else: + try: + import auto_gptq + from auto_gptq import AutoGPTQForCausalLM + except ImportError: + raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit") + + try: + import hf_bleeding_edge + from hf_bleeding_edge import AutoModelForCausalLM + except ImportError: + from transformers import AutoModelForCausalLM + # Monkey patch in hf_bleeding_edge to avoid having to trust remote code auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig @@ -227,12 +225,12 @@ class model_backend(HFTorchInferenceModel): return self.model.generate(*args, **kwargs) type(model).generate = generate - else: - raise RuntimeError(f"4-bit load failed. 
Model type {model_type} not supported in 4-bit") return model def _get_tokenizer(self, location: str): + from transformers import AutoTokenizer, LlamaTokenizer + model_type = self.get_model_type() if model_type == "llama": tokenizer = LlamaTokenizer.from_pretrained(location) From 73953068c0e0752094843e17151471056aa132f2 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 23 Jul 2023 22:12:31 +0200 Subject: [PATCH 113/113] Remove exllama backend, pending further fixes --- modeling/inference_models/exllama/class.py | 446 --------------------- 1 file changed, 446 deletions(-) delete mode 100644 modeling/inference_models/exllama/class.py diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py deleted file mode 100644 index e3c7a874..00000000 --- a/modeling/inference_models/exllama/class.py +++ /dev/null @@ -1,446 +0,0 @@ -from __future__ import annotations - -import time, json -import torch -import requests -import numpy as np -from typing import List, Optional, Union -import os -import glob -from pathlib import Path -import re -import warnings -import gc - -import utils -from logger import logger - -from modeling import warpers -from modeling.warpers import Warper -from modeling.stoppers import Stoppers -from modeling.post_token_hooks import PostTokenHooks -from modeling.inference_model import ( - GenerationResult, - GenerationSettings, - InferenceModel, - ModelCapabilities, -) - -from modeling.tokenizer import GenericTokenizer - -from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig -from transformers import LlamaTokenizer -from exllama.generator import ExLlamaGenerator - -model_backend_type = "GPTQ" -model_backend_name = "ExLlama" - -# When set to true, messages will appear in the console if samplers are not -# changing the scores. Keep in mind some samplers don't always change the -# scores for each token. 
-LOG_SAMPLER_NO_EFFECT = False - - -def load_model_gptq_settings(path): - try: - js = json.load(open(path + "/config.json", "r")) - except Exception as e: - return False, False - - gptq_model = False - gptq_file = False - - gptq_legacy_files = glob.glob(os.path.join(path, "*4bit*.safetensors")) - if "gptq_bits" in js: - gptq_model = True - gptq_file = os.path.join(path, "model.safetensors") - elif gptq_legacy_files: - gptq_model = True - gptq_file = gptq_legacy_files[0] - fname = Path(gptq_file).parts[-1] - g = re.findall("(?:4bit)(?:-)(\\d+)(?:g-?)", fname) - - return gptq_model, gptq_file - - -class model_backend(InferenceModel): - def __init__(self) -> None: - super().__init__() - self.model_config = None - - self.model = None - self.tokenizer = None - self.cache = None - self.generator = None - - self.model_name = "" - self.path = None - - self.post_token_hooks = [ - PostTokenHooks.stream_tokens, - ] - - self.stopper_hooks = [ - Stoppers.core_stopper, - Stoppers.dynamic_wi_scanner, - Stoppers.singleline_stopper, - Stoppers.chat_mode_stopper, - Stoppers.stop_sequence_stopper, - ] - - self.capabilties = ModelCapabilities( - embedding_manipulation=False, - post_token_hooks=True, - stopper_hooks=False, - post_token_probs=False, - ) - - def is_valid(self, model_name, model_path, menu_path): - gptq_model, _ = load_model_gptq_settings(model_path) - try: - self.model_config = self._load_config(model_name, model_path) - return self.model_config and gptq_model - except: - return False - - def get_local_model_path(self): - return self.path or os.path.join("models", self.model_name.replace("/", "_")) - - def _load_config(self, model_name, model_path): - config = False - if model_path is not None and os.path.exists(model_path): - config = ExLlamaConfig(os.path.join(model_path, "config.json")) - if not config and os.path.exists("models/{}".format(model_name.replace('/', '_'))): - config = ExLlamaConfig(os.path.join("models/{}".format(model_name.replace('/', '_')), "config.json")) - - return config - - def _load(self, save_model: bool, initial_load: bool) -> None: - self.model = self._get_model(self.get_local_model_path(), {}) - self.tokenizer = self._get_tokenizer(self.get_local_model_path()) - - self.cache = ExLlamaCache(self.model) - - self.generator = ExLlamaGenerator(self.model, self.tokenizer.tokenizer, self.cache) - - def _post_load(self) -> None: - # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer - self.tokenizer.add_bos_token = False - - # HF transformers no longer supports decode_with_prefix_space - # We work around this by wrapping decode, encode, and __call__ - # with versions that work around the 'prefix space' misfeature - # of sentencepiece. - vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size)) - has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")} - - # Wrap 'decode' with a method that always returns text starting with a space - # when the head token starts with a space. This is what 'decode_with_prefix_space' - # used to do, and we implement it using the same technique (building a cache of - # tokens that should have a prefix space, and then prepending a space if the first - # token is in this set.) We also work around a bizarre behavior in which decoding - # a single token 13 behaves differently than decoding a squence containing only [13]. 
- original_decode = type(self.tokenizer.tokenizer).decode - def decode_wrapper(self, token_ids, *args, **kwargs): - first = None - # Note, the code below that wraps single-value token_ids in a list - # is to work around this wonky behavior: - # >>> t.decode(13) - # '<0x0A>' - # >>> t.decode([13]) - # '\n' - # Not doing this causes token streaming to receive <0x0A> characters - # instead of newlines. - if isinstance(token_ids, int): - first = token_ids - token_ids = [first] - elif hasattr(token_ids, 'dim'): # Check for e.g. torch.Tensor - # Tensors don't support the Python standard of 'empty is False' - # and the special case of dimension 0 tensors also needs to be - # handled separately. - if token_ids.dim() == 0: - first = int(token_ids.item()) - token_ids = [first] - elif len(token_ids) > 0: - first = int(token_ids[0]) - elif token_ids is not None and len(token_ids) > 0: - first = token_ids[0] - result = original_decode(self, token_ids, *args, **kwargs) - if first is not None and first in has_prefix_space: - result = " " + result - return result - # GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it - object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer)) - - # Wrap encode and __call__ to work around the 'prefix space' misfeature also. - # The problem is that "Bob" at the start of text is encoded as if it is - # " Bob". This creates a problem because it means you can't split text, encode - # the pieces, concatenate the tokens, decode them, and get the original text back. - # The workaround is to prepend a known token that (1) starts with a space; and - # (2) is not the prefix of any other token. After searching through the vocab - # " ," (space comma) is the only token containing only printable ascii characters - # that fits this bill. By prepending ',' to the text, the original encode - # method always returns [1919, ...], where the tail of the sequence is the - # actual encoded result we want without the prefix space behavior. - original_encode = type(self.tokenizer.tokenizer).encode - def encode_wrapper(self, text, *args, **kwargs): - if type(text) is str: - text = ',' + text - result = original_encode(self, text, *args, **kwargs) - result = result[1:] - else: - result = original_encode(self, text, *args, **kwargs) - return result - object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer)) - - # Since 'encode' is documented as being deprecated, also override __call__. - # This doesn't appear to currently be used by KoboldAI, but doing so - # in case someone uses it in the future. 
- original_call = type(self.tokenizer.tokenizer).__call__ - def call_wrapper(self, text, *args, **kwargs): - if type(text) is str: - text = ',' + text - result = original_call(self, text, *args, **kwargs) - result = result[1:] - else: - result = original_call(self, text, *args, **kwargs) - return result - object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer)) - - def unload(self): - self.model_config = None - - self.model = None - self.tokenizer = None - self.cache = None - self.generator = None - - self.model_name = "" - self.path = None - - with torch.no_grad(): - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated") - for tensor in gc.get_objects(): - try: - if torch.is_tensor(tensor): - tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype)) - except: - pass - gc.collect() - try: - with torch.no_grad(): - torch.cuda.empty_cache() - except: - pass - - def _apply_warpers( - self, scores: torch.Tensor, input_ids: torch.Tensor - ) -> torch.Tensor: - warpers.update_settings() - - if LOG_SAMPLER_NO_EFFECT: - pre = torch.Tensor(scores) - - for sid in utils.koboldai_vars.sampler_order: - warper = Warper.from_id(sid) - - if not warper.value_is_valid(): - continue - - if warper == warpers.RepetitionPenalty: - # Rep pen needs more data than other samplers - scores = warper.torch(scores, input_ids=input_ids) - else: - scores = warper.torch(scores) - - assert scores is not None, f"Scores are None; warper '{warper}' is to blame" - - if LOG_SAMPLER_NO_EFFECT: - if torch.equal(pre, scores): - logger.info(warper, "had no effect on the scores.") - pre = torch.Tensor(scores) - return scores - - def _raw_generate( - self, - prompt_tokens: Union[List[int], torch.Tensor], - max_new: int, - gen_settings: GenerationSettings, - single_line: bool = False, - batch_count: int = 1, - seed: Optional[int] = None, - **kwargs, - ) -> GenerationResult: - if seed: - torch.manual_seed(seed) - - if not isinstance(prompt_tokens, torch.Tensor): - gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] - else: - gen_in = prompt_tokens - - self.generator.gen_begin_reuse(gen_in) - - for i in range(max_new): - logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache) - logits[:, :, self.tokenizer.bos_token_id] = -10000.0 - - logits = torch.unsqueeze(logits[0, -1, :], 0) - - scores = self._apply_warpers(logits, gen_in) - - scores = torch.softmax(scores, dim=-1) - - token = torch.multinomial(scores, 1) - - self.generator.gen_accept_token(token) - - self._post_token_gen(self.generator.sequence) - - utils.koboldai_vars.generated_tkns += 1 - - if token.item() == self.tokenizer.eos_token_id: break - - utils.koboldai_vars.generated_tkns = max_new - - return GenerationResult( - model=self, - out_batches=np.array( - self.generator.sequence[:, gen_in.size(1):], - ), - prompt=prompt_tokens, - is_whole_generation=True, - single_line=single_line, - ) - - def _get_model(self, location: str, tf_kwargs: Dict): - if not self.model_config: - ExLlamaConfig(os.path.join(location, "config.json")) - - _, self.model_config.model_path = load_model_gptq_settings(location) - # self.model_config.gpu_peer_fix = True - return ExLlama(self.model_config) - - def _get_tokenizer(self, location: str): - tokenizer = GenericTokenizer(LlamaTokenizer.from_pretrained(location)) - tokenizer._koboldai_header = tokenizer.encode("") - return tokenizer - - def get_requested_parameters(self, model_name, model_path, menu_path, 
parameters = {}): - requested_parameters = [] - gpu_count = torch.cuda.device_count() - layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None - requested_parameters.append({ - "uitype": "Valid Display", - "unit": "text", - "label": "Current Allocated Layers: %1/{}".format(layer_count), #%1 will be the validation value - "id": "valid_layers", - "max": layer_count, - "step": 1, - "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="}, - "menu_path": "Layers", - "extra_classes": "", - "refresh_model_inputs": False - }) - for i in range(gpu_count): - requested_parameters.append({ - "uitype": "slider", - "unit": "int", - "label": "{} Layers".format(torch.cuda.get_device_name(i)), - "id": "{}_Layers".format(i), - "min": 0, - "max": layer_count, - "step": 1, - "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="}, - "check_message": "The sum of assigned layers must equal {}".format(layer_count), - "default": [layer_count if i == 0 else 0], - "tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)), - "menu_path": "Layers", - "extra_classes": "", - "refresh_model_inputs": False - }) - - requested_parameters.append({ - "uitype": "slider", - "unit": "int", - "label": "Maximum Context", - "id": "max_ctx", - "min": 2048, - "max": 16384, - "step": 512, - "default": 2048, - "tooltip": "The maximum context size the model supports", - "menu_path": "Configuration", - "extra_classes": "", - "refresh_model_inputs": False - }) - - requested_parameters.append({ - "uitype": "slider", - "unit": "float", - "label": "Embedding Compression", - "id": "compress_emb", - "min": 1, - "max": 8, - "step": 0.25, - "default": 1, - "tooltip": "If the model requires compressed embeddings, set them here", - "menu_path": "Configuration", - "extra_classes": "", - "refresh_model_inputs": False - }) - - requested_parameters.append({ - "uitype": "slider", - "unit": "float", - "label": "NTK alpha", - "id": "ntk_alpha", - "min": 1, - "max": 32, - "step": 0.25, - "default": 1, - "tooltip": "NTK alpha value", - "menu_path": "Configuration", - "extra_classes": "", - "refresh_model_inputs": False - }) - - return requested_parameters - - def set_input_parameters(self, parameters): - gpu_count = torch.cuda.device_count() - layers = [] - for i in range(gpu_count): - if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric(): - layers.append(int(parameters["{}_Layers".format(i)])) - elif isinstance(parameters["{}_Layers".format(i)], str): - layers.append(None) - else: - layers.append(parameters["{}_Layers".format(i)]) - - self.layers = layers - self.model_config.device_map.layers = [] - for i, l in enumerate(layers): - if l > 0: - self.model_config.device_map.layers.extend([f"cuda:{i}"] * l) - self.model_config.device_map.lm_head = "cuda:0" - self.model_config.device_map.norm = "cuda:0" - - self.model_config.max_seq_len = parameters["max_ctx"] - self.model_config.compress_pos_emb = parameters["compress_emb"] - self.model_config.alpha_value = parameters["ntk_alpha"] - self.model_config.calculate_rotary_embedding_base() - - # Disable half2 for HIP - self.model_config.rmsnorm_no_half2 = 
bool(torch.version.hip) - self.model_config.rope_no_half2 = bool(torch.version.hip) - self.model_config.matmul_no_half2 = bool(torch.version.hip) - self.model_config.silu_no_half2 = bool(torch.version.hip) - - # Disable scaled_dot_product_attention if torch version < 2 - if torch.__version__.startswith("1."): - self.model_config.sdp_thd = 0 - - self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] - self.path = parameters['path'] if 'path' in parameters else None
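
A note on the `ntk_alpha` parameter wired up in the patches above: NTK-aware scaling extends the usable context by enlarging the RoPE frequency base rather than compressing position indices, which is what `calculate_rotary_embedding_base()` derives from the slider value. The sketch below shows the commonly used formula; it is an approximation for illustration, and the exact expression inside exllama may differ.

```python
def ntk_scaled_rope_base(alpha: float, head_dim: int, base: float = 10000.0) -> float:
    """Return the enlarged RoPE frequency base for a given NTK alpha.

    alpha = 1.0 leaves the base unchanged; larger values are used when
    running a model beyond its trained context length.
    """
    return base * alpha ** (head_dim / (head_dim - 2))

# With 128-dimensional attention heads, alpha = 2.0 raises the base
# from 10000 to roughly 20221.
print(ntk_scaled_rope_base(alpha=2.0, head_dim=128))
```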