From 8c9ed5540626655870b6c8e79b5a838f6f012a91 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Thu, 9 Mar 2023 22:36:45 -0500 Subject: [PATCH 01/47] Update aiserver.py --- aiserver.py | 63 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/aiserver.py b/aiserver.py index 7c60b04e..4174d1fa 100644 --- a/aiserver.py +++ b/aiserver.py @@ -87,6 +87,38 @@ from io import BytesIO global tpu_mtj_backend +from transformers.models.llama.tokenization_llama import LLaMATokenizer +from repos.gptq.gptq import * +from repos.gptq.modelutils import * +from repos.gptq.quant import * +def load_quant(model, checkpoint, wbits): + from transformers import LLaMAConfig, LLaMAForCausalLM + config = LLaMAConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = LLaMAForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + make_quant(model, layers, wbits) + + print('Loading model ...') + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done.') + + return model + if lupa.LUA_VERSION[:2] != (5, 4): logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.") @@ -2886,7 +2918,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal @functools.lru_cache(maxsize=None) def get_original_key(key): - return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + try: + return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + except ValueError: + return key for key, value in model_dict.items(): original_key = get_original_key(key) @@ -3083,22 +3118,24 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): + tokenizer = LLaMATokenizer.from_pretrained(koboldai_vars.custmodpth) + # try: + # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) + # except Exception as e: + # try: + # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") + # except Exception as e: + # try: + # tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") + # except Exception as e: + # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) - except Exception as e: - try: - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - try: - tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - except Exception as e: - tokenizer = GPT2Tokenizer.from_pretrained("gpt2", 
revision=koboldai_vars.revision, cache_dir="cache") - try: - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_30B_4BIT'], 4) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From dcf9d37a00dc582618f10deef6d226f77018dc16 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Fri, 10 Mar 2023 00:01:40 -0500 Subject: [PATCH 02/47] It just works. --- aiserver.py | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/aiserver.py b/aiserver.py index 4174d1fa..66aa7362 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1142,9 +1142,9 @@ def move_model_to_devices(model): if(not utils.HAS_ACCELERATE and not koboldai_vars.breakmodel): if(koboldai_vars.usegpu): - model = model.half().to(koboldai_vars.gpu_device) + model = model.to(koboldai_vars.gpu_device) else: - model = model.to('cpu').float() + model = model.to('cpu') generator = model.generate return @@ -1172,7 +1172,6 @@ def move_model_to_devices(model): generator = model.generate return - model.half() gc.collect() if(hasattr(model, "transformer")): @@ -2983,10 +2982,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
", end="", flush=True) model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - if model_dict[key].dtype is torch.float32: - koboldai_vars.fp32_model = True - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - model_dict[key] = model_dict[key].to(torch.float16) + # if model_dict[key].dtype is torch.float32: + # koboldai_vars.fp32_model = True + # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: + # model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -3010,16 +3009,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if utils.offload_index: for name, tensor in utils.named_buffers: dtype = tensor.dtype - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): - dtype = torch.float32 - if name in model_dict and model_dict[name].dtype is not dtype: - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is not dtype: - tensor = tensor.to(dtype) - if name not in utils.offload_index: - accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) + # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): + # dtype = torch.float16 + # if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): + # dtype = torch.float32 + # if name in model_dict and model_dict[name].dtype is not dtype: + # model_dict[name] = model_dict[name].to(dtype) + # if tensor.dtype is not dtype: + # tensor = tensor.to(dtype) + # if name not in utils.offload_index: + # accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache") utils.bar.close() utils.bar = None @@ -3078,10 +3077,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) # Is CUDA available? 
If so, use GPU, otherwise fall back to CPU if(koboldai_vars.hascuda and koboldai_vars.usegpu): - model = model.half().to(koboldai_vars.gpu_device) + model = model.to(koboldai_vars.gpu_device) generator = model.generate else: - model = model.to('cpu').float() + model = model.to('cpu') generator = model.generate patch_causallm(model) # Use the Generic implementation @@ -3131,7 +3130,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_30B_4BIT'], 4) + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") @@ -3190,7 +3189,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal import shutil tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) if(koboldai_vars.fp32_model and ("breakmodel" not in globals() or not breakmodel.disk_blocks)): # Use save_pretrained to convert fp32 models to fp16, unless we are using disk cache because save_pretrained is not supported in that case - model = model.half() model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB") else: # For fp16 models, we can just copy the model files directly import transformers.configuration_utils @@ -3224,7 +3222,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.hascuda): if(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) - model = model.half().to(koboldai_vars.gpu_device) + model = model.to(koboldai_vars.gpu_device) generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) @@ -3236,7 +3234,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model = model.to('cpu').float() + model = model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): @@ -3244,7 +3242,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model.to('cpu').float() + model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate From 3f132ce45ba61f30015147bb0d9ba26647204332 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Fri, 10 Mar 2023 03:26:09 -0500 Subject: [PATCH 03/47] Notify if LLAMA_4BIT env var not set --- aiserver.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 66aa7362..399ce434 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3130,7 +3130,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: # 
model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + if os.environ.get('LLAMA_4BIT') is not None: + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + else: + raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before procedding.") + exit(1) + except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") From 1808b0d2eca42e30bee6edd6896744cfd6995ffc Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Sat, 11 Mar 2023 12:13:22 -0500 Subject: [PATCH 04/47] Another safety check for if model is not loaded --- aiserver.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 399ce434..3ec8f284 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3133,13 +3133,17 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if os.environ.get('LLAMA_4BIT') is not None: model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) else: - raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before procedding.") + raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") exit(1) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + + if model is None: + raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load. Exiting.") + exit(1) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From bde31217f164a3aadc4282913012378a886d6058 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Sat, 11 Mar 2023 12:15:58 -0500 Subject: [PATCH 05/47] improve model None check --- aiserver.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/aiserver.py b/aiserver.py index 3ec8f284..c14ac730 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3134,16 +3134,14 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) else: raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") - exit(1) + + if model is None: + raise RuntimeError("Model returned 'None'. 
This is not expected to happen, but due to this, the model will not load.") except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - - if model is None: - raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load. Exiting.") - exit(1) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From b3b454bbe4b4a479ec5703b99487bf00906975ac Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed, 15 Mar 2023 00:03:43 -0500 Subject: [PATCH 06/47] Update huggingface.yml --- environments/huggingface.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 222bb6ad..26e7e670 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -29,7 +29,8 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers==4.25.1 + - git+https://github.com/zphang/transformers@llama_push + - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc - huggingface_hub==0.12.1 - safetensors - accelerate From 5d17692c79a3642b7e1ae1c37e262cd47f449356 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 16 Mar 2023 05:19:47 +0000 Subject: [PATCH 07/47] Remove except Exception so that errors actually show up --- aiserver.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/aiserver.py b/aiserver.py index 77e31b63..40d9a4ba 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3128,20 +3128,15 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") # except Exception as e: # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - try: - # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - if os.environ.get('LLAMA_4BIT') is not None: - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) - else: - raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") + # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) - if model is None: - raise RuntimeError("Model returned 'None'. 
This is not expected to happen, but due to this, the model will not load.") + if os.environ.get('LLAMA_4BIT'): + model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + else: + raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") - except Exception as e: - if("out of memory" in traceback.format_exc().lower()): - raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - # model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + if model is None: + raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load.") elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) From 60acf593160ce86118286ab0fa5c4ce082ddc52c Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 19 Mar 2023 21:19:02 +0000 Subject: [PATCH 08/47] Improve 4-bit llama support, add 4-bit gptj and gptneox support --- aiserver.py | 86 +++++++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 46 deletions(-) diff --git a/aiserver.py b/aiserver.py index 40d9a4ba..96ea7490 100644 --- a/aiserver.py +++ b/aiserver.py @@ -87,37 +87,14 @@ from io import BytesIO global tpu_mtj_backend -from transformers.models.llama.tokenization_llama import LLaMATokenizer -from repos.gptq.gptq import * -from repos.gptq.modelutils import * -from repos.gptq.quant import * -def load_quant(model, checkpoint, wbits): - from transformers import LLaMAConfig, LLaMAForCausalLM - config = LLaMAConfig.from_pretrained(model) - def noop(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = noop - torch.nn.init.uniform_ = noop - torch.nn.init.normal_ = noop - torch.set_default_dtype(torch.half) - transformers.modeling_utils._init_weights = False - torch.set_default_dtype(torch.half) - model = LLaMAForCausalLM(config) - torch.set_default_dtype(torch.float) - model = model.eval() - layers = find_layers(model) - for name in ['lm_head']: - if name in layers: - del layers[name] - make_quant(model, layers, wbits) - - print('Loading model ...') - model.load_state_dict(torch.load(checkpoint)) - model.seqlen = 2048 - print('Done.') - - return model +# 4-bit dependencies +from pathlib import Path +sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) +from gptj import load_quant as gptj_load_quant +from gptneox import load_quant as gptneox_load_quant +from llama import load_quant as llama_load_quant +vars_4bit = {} if lupa.LUA_VERSION[:2] != (5, 4): @@ -1541,6 +1518,11 @@ def general_startup(override_args=None): parser.add_argument('-v', '--verbosity', action='count', default=0, help="The default logging level is ERROR or higher. This value increases the amount of logging seen in your screen") parser.add_argument('-q', '--quiesce', action='count', default=0, help="The default logging level is ERROR or higher. 
This value decreases the amount of logging seen in your screen") + # 4-bit stuff + parser.add_argument('--gptj4bit', help="Load a GPT-J model 4-bit pt file with this path") + parser.add_argument('--gptneox4bit', help="Load a GPT-NeoX model 4-bit pt file with this path") + parser.add_argument('--llama4bit', help="Load a Llama model 4-bit pt file with this path") + #args: argparse.Namespace = None if "pytest" in sys.modules and override_args is None: args = parser.parse_args([]) @@ -1644,6 +1626,11 @@ def general_startup(override_args=None): koboldai_vars.smanrename = koboldai_vars.host == args.override_rename koboldai_vars.aria2_port = args.aria2_port or 6799 + + global vars_4bit + vars_4bit["gptj4bit"] = args.gptj4bit + vars_4bit["gptneox4bit"] = args.gptneox4bit + vars_4bit["llama4bit"] = args.llama4bit #Now let's look to see if we are going to force a load of a model from a user selected folder if(koboldai_vars.model == "selectfolder"): @@ -2971,7 +2958,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal try: f = z.open(f"archive/data/{storage_key}") except: - f = z.open(f"{zipfolder}/data/{storage_key}") + ziproot = z.namelist()[0].split(os.sep)[0] + f = z.open(f"{ziproot}/data/{storage_key}") current_offset = 0 if current_offset != model_dict[key].seek_offset: f.read(model_dict[key].seek_offset - current_offset) @@ -3117,23 +3105,29 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): - tokenizer = LLaMATokenizer.from_pretrained(koboldai_vars.custmodpth) - # try: - # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) - # except Exception as e: - # try: - # tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - # except Exception as e: - # try: - # tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") - # except Exception as e: - # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - # model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + global vars_4bit - if os.environ.get('LLAMA_4BIT'): - model = load_quant(koboldai_vars.custmodpth, os.environ['LLAMA_4BIT'], 4) + if vars_4bit.get("gptj4bit"): + model = gptj_load_quant(koboldai_vars.custmodpth, vars_4bit["gptj4bit"], 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif vars_4bit.get("gptneox4bit"): + model = gptneox_load_quant(koboldai_vars.custmodpth, vars_4bit["gptneox4bit"], 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif vars_4bit.get("llama4bit"): + model = llama_load_quant(koboldai_vars.custmodpth, vars_4bit["llama4bit"], 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: - raise RuntimeError("It looks like your environment variable for LLAMA_4BIT is not set (the model path).\nPlease set this variable before proceeding.") + try: + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) + except Exception as e: + try: + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, 
revision=koboldai_vars.revision, cache_dir="cache") + except Exception as e: + try: + tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") + except Exception as e: + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) if model is None: raise RuntimeError("Model returned 'None'. This is not expected to happen, but due to this, the model will not load.") From 858657f6691933ad3660660001837491b7ba4ae6 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 20 Mar 2023 09:16:30 +0100 Subject: [PATCH 09/47] Fix zipfile folder identification fix for Windows --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 96ea7490..4558ce3d 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2958,7 +2958,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal try: f = z.open(f"archive/data/{storage_key}") except: - ziproot = z.namelist()[0].split(os.sep)[0] + ziproot = z.namelist()[0].split("/")[0] f = z.open(f"{ziproot}/data/{storage_key}") current_offset = 0 if current_offset != model_dict[key].seek_offset: From 4cfc1219d449ebc92205eed15f0ffc1b133db708 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 20 Mar 2023 19:13:46 +0000 Subject: [PATCH 10/47] Add gptq as submodule --- .gitmodules | 4 ++++ repos/gptq | 1 + 2 files changed, 5 insertions(+) create mode 160000 repos/gptq diff --git a/.gitmodules b/.gitmodules index 0107a8c3..c6f4b308 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,7 @@ [submodule "KoboldAI-Horde-Bridge"] path = KoboldAI-Horde-Bridge url = https://github.com/db0/KoboldAI-Horde-Bridge +[submodule "repos/gptq"] + path = repos/gptq + url = https://github.com/0cc4m/GPTQ-for-LLaMa + branch = a8303654c200c25577130466e5f9bc1e70fc8a50 diff --git a/repos/gptq b/repos/gptq new file mode 160000 index 00000000..a8303654 --- /dev/null +++ b/repos/gptq @@ -0,0 +1 @@ +Subproject commit a8303654c200c25577130466e5f9bc1e70fc8a50 From ecd065a881d40996558ff07d0e2bfdbdf255e777 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 21 Mar 2023 21:40:59 +0000 Subject: [PATCH 11/47] Overhaul 4-bit support to load with a toggle --- aiserver.py | 145 +++++++++++++++++++++++++++--------------- koboldai_settings.py | 6 +- static/koboldai.js | 32 +++++++++- templates/popups.html | 6 +- 4 files changed, 130 insertions(+), 59 deletions(-) diff --git a/aiserver.py b/aiserver.py index f58d949a..7497dfb9 100644 --- a/aiserver.py +++ b/aiserver.py @@ -70,7 +70,7 @@ from utils import debounce import utils import koboldai_settings import torch -from transformers import StoppingCriteria, GPT2Tokenizer, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel, modeling_utils, AutoModelForTokenClassification +from transformers import StoppingCriteria, GPT2Tokenizer, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel, modeling_utils, AutoModelForTokenClassification, LlamaTokenizer from transformers import __version__ as transformers_version import transformers try: @@ -1114,14 +1114,20 @@ def device_config(config): koboldai_vars.usegpu = False return -def move_model_to_devices(model): +def move_model_to_devices(model, use_4_bit=False): global generator if(not 
utils.HAS_ACCELERATE and not koboldai_vars.breakmodel): if(koboldai_vars.usegpu): - model = model.to(koboldai_vars.gpu_device) + if not use_4_bit: + model = model.half().to(koboldai_vars.gpu_device) + else: + model = model.to(koboldai_vars.gpu_device) else: - model = model.to('cpu') + if not use_4_bit: + model = model.to('cpu').float() + else: + model = model.to('cpu') generator = model.generate return @@ -1149,6 +1155,8 @@ def move_model_to_devices(model): generator = model.generate return + if not use_4_bit: + model.half() gc.collect() if(hasattr(model, "transformer")): @@ -1518,11 +1526,6 @@ def general_startup(override_args=None): parser.add_argument('-v', '--verbosity', action='count', default=0, help="The default logging level is ERROR or higher. This value increases the amount of logging seen in your screen") parser.add_argument('-q', '--quiesce', action='count', default=0, help="The default logging level is ERROR or higher. This value decreases the amount of logging seen in your screen") - # 4-bit stuff - parser.add_argument('--gptj4bit', help="Load a GPT-J model 4-bit pt file with this path") - parser.add_argument('--gptneox4bit', help="Load a GPT-NeoX model 4-bit pt file with this path") - parser.add_argument('--llama4bit', help="Load a Llama model 4-bit pt file with this path") - #args: argparse.Namespace = None if "pytest" in sys.modules and override_args is None: args = parser.parse_args([]) @@ -1626,11 +1629,6 @@ def general_startup(override_args=None): koboldai_vars.smanrename = koboldai_vars.host == args.override_rename koboldai_vars.aria2_port = args.aria2_port or 6799 - - global vars_4bit - vars_4bit["gptj4bit"] = args.gptj4bit - vars_4bit["gptneox4bit"] = args.gptneox4bit - vars_4bit["llama4bit"] = args.llama4bit #Now let's look to see if we are going to force a load of a model from a user selected folder if(koboldai_vars.model == "selectfolder"): @@ -1777,6 +1775,7 @@ def get_model_info(model, directory=""): 'break_values': break_values, 'gpu_count': gpu_count, 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select, 'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False, + 'bit_4_available': koboldai_vars.bit_4_available if koboldai_vars.experimental_features else False, 'show_custom_model_box': show_custom_model_box}) if send_horde_models: get_cluster_models({'key': key_value, 'url': default_url}) @@ -1918,6 +1917,18 @@ def get_cluster_models(msg): emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2") +@socketio.on("use_4_bit_toggle") +def use_4_bit_toggle(msg): + # Disable lazy_load and breakmodel + if msg["use_4_bit"]: + koboldai_vars.lazy_load = False + koboldai_vars.nobreakmodel = True + else: + koboldai_vars.lazy_load = True + koboldai_vars.nobreakmodel = False + + # TODO: Reload JS values for this stuff + # Function to patch transformers to use our soft prompt def patch_causallm(model): from torch.nn import Embedding @@ -2647,7 +2658,7 @@ def unload_model(): koboldai_vars.badwordsids = koboldai_settings.badwordsids_default -def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False): +def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, 
use_4_bit=False): global model global generator global torch @@ -2684,7 +2695,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal disk_layers = args.breakmodel_disklayers if breakmodel_args_default_to_cpu and disk_layers is None: disk_layers = args.breakmodel_disklayers = 0 - + unload_model() if online_model == "": @@ -2904,10 +2915,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal @functools.lru_cache(maxsize=None) def get_original_key(key): - try: - return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) - except ValueError: - return key + # try: + return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + # except ValueError: + # return key for key, value in model_dict.items(): original_key = get_original_key(key) @@ -2970,10 +2981,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True) model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - # if model_dict[key].dtype is torch.float32: - # koboldai_vars.fp32_model = True - # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - # model_dict[key] = model_dict[key].to(torch.float16) + if not use_4_bit: + if model_dict[key].dtype is torch.float32: + koboldai_vars.fp32_model = True + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: + model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -2997,16 +3009,17 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if utils.offload_index: for name, tensor in utils.named_buffers: dtype = tensor.dtype - # if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): - # dtype = torch.float16 - # if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): - # dtype = torch.float32 - # if name in model_dict and model_dict[name].dtype is not dtype: - # model_dict[name] = model_dict[name].to(dtype) - # if tensor.dtype is not dtype: - # tensor = tensor.to(dtype) - # if name not in utils.offload_index: - # accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) + if not use_4_bit: + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): + dtype = torch.float16 + if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): + dtype = torch.float32 + if name in model_dict and model_dict[name].dtype is not dtype: + model_dict[name] = model_dict[name].to(dtype) + if tensor.dtype is not dtype: + tensor = tensor.to(dtype) + if name not 
in utils.offload_index: + accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache") utils.bar.close() utils.bar = None @@ -3065,10 +3078,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.modeldim = get_hidden_size_from_model(model) # Is CUDA available? If so, use GPU, otherwise fall back to CPU if(koboldai_vars.hascuda and koboldai_vars.usegpu): - model = model.to(koboldai_vars.gpu_device) + if not use_4_bit: + model = model.half().to(koboldai_vars.gpu_device) + else: + model = model.to(koboldai_vars.gpu_device) generator = model.generate else: - model = model.to('cpu') + if not use_4_bit: + model = model.to('cpu').float() + else: + model = model.to('cpu') generator = model.generate patch_causallm(model) # Use the Generic implementation @@ -3105,17 +3124,26 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): - global vars_4bit - if vars_4bit.get("gptj4bit"): - model = gptj_load_quant(koboldai_vars.custmodpth, vars_4bit["gptj4bit"], 4) - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) - elif vars_4bit.get("gptneox4bit"): - model = gptneox_load_quant(koboldai_vars.custmodpth, vars_4bit["gptneox4bit"], 4) - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) - elif vars_4bit.get("llama4bit"): - model = llama_load_quant(koboldai_vars.custmodpth, vars_4bit["llama4bit"], 4) - tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") + + if not os.path.isfile(path_4bit): + print(f"4-bit file {path_4bit} not found, aborting 4-bit load") + use_4_bit = False + + if use_4_bit: + print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") + if koboldai_vars.model_type == "gptj": + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif koboldai_vars.model_type == "gpt_neox": + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif koboldai_vars.model_type == "llama": + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) + else: + raise RuntimeError(f"4-bit load failed. 
Model type {koboldai_vars.model_type} not supported in 4-bit") else: try: tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) @@ -3185,6 +3213,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal import shutil tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) if(koboldai_vars.fp32_model and ("breakmodel" not in globals() or not breakmodel.disk_blocks)): # Use save_pretrained to convert fp32 models to fp16, unless we are using disk cache because save_pretrained is not supported in that case + if not use_4_bit: + model = model.half() model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB") else: # For fp16 models, we can just copy the model files directly import transformers.configuration_utils @@ -3218,27 +3248,36 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.hascuda): if(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) - model = model.to(koboldai_vars.gpu_device) + if not use_4_bit: + model = model.half().to(koboldai_vars.gpu_device) + else: + model = model.to(koboldai_vars.gpu_device) generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) if(not koboldai_vars.lazy_load): device_config(model.config) - move_model_to_devices(model) + move_model_to_devices(model, use_4_bit) elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): - move_model_to_devices(model) + move_model_to_devices(model, use_4_bit) koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model = model.to('cpu') + if not use_4_bit: + model.to('cpu').float() + else: + model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): - move_model_to_devices(model) + move_model_to_devices(model, use_4_bit) koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate else: - model.to('cpu') + if not use_4_bit: + model.to('cpu').float() + else: + model.to('cpu') koboldai_vars.modeldim = get_hidden_size_from_model(model) generator = model.generate @@ -8784,7 +8823,7 @@ def UI_2_load_model(data): koboldai_vars.model = data['model'] koboldai_vars.custmodpth = data['path'] print("loading Model") - load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit']) + load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'], use_4_bit=data['use_4_bit']) #==================================================================# # Event triggered when load story is clicked diff --git a/koboldai_settings.py b/koboldai_settings.py index 95caec0c..16cc8128 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1207,7 +1207,7 @@ class system_settings(settings): 'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy', 'serverstarted', 'inference_config', 'image_pipeline', 'summarizer', 'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model', - 
'generating_image', 'bit_8_available', 'host', 'hascuda', 'usegpu', 'rng_states'] + 'generating_image', 'bit_8_available', 'bit_4_available', 'host', 'hascuda', 'usegpu', 'rng_states'] settings_name = "system" def __init__(self, socketio, koboldai_var): self._socketio = socketio @@ -1302,6 +1302,8 @@ class system_settings(settings): elif torch.cuda.get_device_properties(device).major == 7 and torch.cuda.get_device_properties(device).minor >= 2: self.bit_8_available = True break + # Check if repos/gptq exists for 4-bit mode + self.bit_4_available = os.path.isdir("repos/gptq") self.seen_messages = [] @@ -2744,4 +2746,4 @@ default_preset = { ] } badwordsids_default = [[6880], [50256], [42496], [4613], [17414], [22039], [16410], [27], [29], [38430], [37922], [15913], [24618], [28725], [58], [47175], [36937], [26700], [12878], [16471], [37981], [5218], [29795], [13412], [45160], [3693], [49778], [4211], [20598], [36475], [33409], [44167], [32406], [29847], [29342], [42669], [685], [25787], [7359], [3784], [5320], [33994], [33490], [34516], [43734], [17635], [24293], [9959], [23785], [21737], [28401], [18161], [26358], [32509], [1279], [38155], [18189], [26894], [6927], [14610], [23834], [11037], [14631], [26933], [46904], [22330], [25915], [47934], [38214], [1875], [14692], [41832], [13163], [25970], [29565], [44926], [19841], [37250], [49029], [9609], [44438], [16791], [17816], [30109], [41888], [47527], [42924], [23984], [49074], [33717], [31161], [49082], [30138], [31175], [12240], [14804], [7131], [26076], [33250], [3556], [38381], [36338], [32756], [46581], [17912], [49146]] # Tokenized array of badwords used to prevent AI artifacting -badwordsids_neox = [[0], [1], [44162], [9502], [12520], [31841], [36320], [49824], [34417], [6038], [34494], [24815], [26635], [24345], [3455], [28905], [44270], [17278], [32666], [46880], [7086], [43189], [37322], [17778], [20879], [49821], [3138], [14490], [4681], [21391], [26786], [43134], [9336], [683], [48074], [41256], [19181], [29650], [28532], [36487], [45114], [46275], [16445], [15104], [11337], [1168], [5647], [29], [27482], [44965], [43782], [31011], [42944], [47389], [6334], [17548], [38329], [32044], [35487], [2239], [34761], [7444], [1084], [12399], [18990], [17636], [39083], [1184], [35830], [28365], [16731], [43467], [47744], [1138], [16079], [40116], [45564], [18297], [42368], [5456], [18022], [42696], [34476], [23505], [23741], [39334], [37944], [45382], [38709], [33440], [26077], [43600], [34418], [36033], [6660], [48167], [48471], [15775], [19884], [41533], [1008], [31053], [36692], [46576], [20095], [20629], [31759], [46410], [41000], [13488], [30952], [39258], [16160], [27655], [22367], [42767], [43736], [49694], [13811], [12004], [46768], [6257], [37471], [5264], [44153], [33805], [20977], [21083], [25416], [14277], [31096], [42041], [18331], [33376], [22372], [46294], [28379], [38475], [1656], [5204], [27075], [50001], [16616], [11396], [7748], [48744], [35402], [28120], [41512], [4207], [43144], [14767], [15640], [16595], [41305], [44479], [38958], [18474], [22734], [30522], [46267], [60], [13976], [31830], [48701], [39822], [9014], [21966], [31422], [28052], [34607], [2479], [3851], [32214], [44082], [45507], [3001], [34368], [34758], [13380], [38363], [4299], [46802], [30996], [12630], [49236], [7082], [8795], [5218], [44740], [9686], [9983], [45301], [27114], [40125], [1570], [26997], [544], [5290], [49193], [23781], [14193], [40000], [2947], [43781], [9102], [48064], [42274], [18772], [49384], [9884], [45635], [43521], [31258], 
[32056], [47686], [21760], [13143], [10148], [26119], [44308], [31379], [36399], [23983], [46694], [36134], [8562], [12977], [35117], [28591], [49021], [47093], [28653], [29013], [46468], [8605], [7254], [25896], [5032], [8168], [36893], [38270], [20499], [27501], [34419], [29547], [28571], [36586], [20871], [30537], [26842], [21375], [31148], [27618], [33094], [3291], [31789], [28391], [870], [9793], [41361], [47916], [27468], [43856], [8850], [35237], [15707], [47552], [2730], [41449], [45488], [3073], [49806], [21938], [24430], [22747], [20924], [46145], [20481], [20197], [8239], [28231], [17987], [42804], [47269], [29972], [49884], [21382], [46295], [36676], [34616], [3921], [26991], [27720], [46265], [654], [9855], [40354], [5291], [34904], [44342], [2470], [14598], [880], [19282], [2498], [24237], [21431], [16369], [8994], [44524], [45662], [13663], [37077], [1447], [37786], [30863], [42854], [1019], [20322], [4398], [12159], [44072], [48664], [31547], [18736], [9259], [31], [16354], [21810], [4357], [37982], [5064], [2033], [32871], [47446], [62], [22158], [37387], [8743], [47007], [17981], [11049], [4622], [37916], [36786], [35138], [29925], [14157], [18095], [27829], [1181], [22226], [5709], [4725], [30189], [37014], [1254], [11380], [42989], [696], [24576], [39487], [30119], [1092], [8088], [2194], [9899], [14412], [21828], [3725], [13544], [5180], [44679], [34398], [3891], [28739], [14219], [37594], [49550], [11326], [6904], [17266], [5749], [10174], [23405], [9955], [38271], [41018], [13011], [48392], [36784], [24254], [21687], [23734], [5413], [41447], [45472], [10122], [17555], [15830], [47384], [12084], [31350], [47940], [11661], [27988], [45443], [905], [49651], [16614], [34993], [6781], [30803], [35869], [8001], [41604], [28118], [46462], [46762], [16262], [17281], [5774], [10943], [5013], [18257], [6750], [4713], [3951], [11899], [38791], [16943], [37596], [9318], [18413], [40473], [13208], [16375]] \ No newline at end of file +badwordsids_neox = [[0], [1], [44162], [9502], [12520], [31841], [36320], [49824], [34417], [6038], [34494], [24815], [26635], [24345], [3455], [28905], [44270], [17278], [32666], [46880], [7086], [43189], [37322], [17778], [20879], [49821], [3138], [14490], [4681], [21391], [26786], [43134], [9336], [683], [48074], [41256], [19181], [29650], [28532], [36487], [45114], [46275], [16445], [15104], [11337], [1168], [5647], [29], [27482], [44965], [43782], [31011], [42944], [47389], [6334], [17548], [38329], [32044], [35487], [2239], [34761], [7444], [1084], [12399], [18990], [17636], [39083], [1184], [35830], [28365], [16731], [43467], [47744], [1138], [16079], [40116], [45564], [18297], [42368], [5456], [18022], [42696], [34476], [23505], [23741], [39334], [37944], [45382], [38709], [33440], [26077], [43600], [34418], [36033], [6660], [48167], [48471], [15775], [19884], [41533], [1008], [31053], [36692], [46576], [20095], [20629], [31759], [46410], [41000], [13488], [30952], [39258], [16160], [27655], [22367], [42767], [43736], [49694], [13811], [12004], [46768], [6257], [37471], [5264], [44153], [33805], [20977], [21083], [25416], [14277], [31096], [42041], [18331], [33376], [22372], [46294], [28379], [38475], [1656], [5204], [27075], [50001], [16616], [11396], [7748], [48744], [35402], [28120], [41512], [4207], [43144], [14767], [15640], [16595], [41305], [44479], [38958], [18474], [22734], [30522], [46267], [60], [13976], [31830], [48701], [39822], [9014], [21966], [31422], [28052], [34607], [2479], [3851], [32214], [44082], [45507], [3001], 
[34368], [34758], [13380], [38363], [4299], [46802], [30996], [12630], [49236], [7082], [8795], [5218], [44740], [9686], [9983], [45301], [27114], [40125], [1570], [26997], [544], [5290], [49193], [23781], [14193], [40000], [2947], [43781], [9102], [48064], [42274], [18772], [49384], [9884], [45635], [43521], [31258], [32056], [47686], [21760], [13143], [10148], [26119], [44308], [31379], [36399], [23983], [46694], [36134], [8562], [12977], [35117], [28591], [49021], [47093], [28653], [29013], [46468], [8605], [7254], [25896], [5032], [8168], [36893], [38270], [20499], [27501], [34419], [29547], [28571], [36586], [20871], [30537], [26842], [21375], [31148], [27618], [33094], [3291], [31789], [28391], [870], [9793], [41361], [47916], [27468], [43856], [8850], [35237], [15707], [47552], [2730], [41449], [45488], [3073], [49806], [21938], [24430], [22747], [20924], [46145], [20481], [20197], [8239], [28231], [17987], [42804], [47269], [29972], [49884], [21382], [46295], [36676], [34616], [3921], [26991], [27720], [46265], [654], [9855], [40354], [5291], [34904], [44342], [2470], [14598], [880], [19282], [2498], [24237], [21431], [16369], [8994], [44524], [45662], [13663], [37077], [1447], [37786], [30863], [42854], [1019], [20322], [4398], [12159], [44072], [48664], [31547], [18736], [9259], [31], [16354], [21810], [4357], [37982], [5064], [2033], [32871], [47446], [62], [22158], [37387], [8743], [47007], [17981], [11049], [4622], [37916], [36786], [35138], [29925], [14157], [18095], [27829], [1181], [22226], [5709], [4725], [30189], [37014], [1254], [11380], [42989], [696], [24576], [39487], [30119], [1092], [8088], [2194], [9899], [14412], [21828], [3725], [13544], [5180], [44679], [34398], [3891], [28739], [14219], [37594], [49550], [11326], [6904], [17266], [5749], [10174], [23405], [9955], [38271], [41018], [13011], [48392], [36784], [24254], [21687], [23734], [5413], [41447], [45472], [10122], [17555], [15830], [47384], [12084], [31350], [47940], [11661], [27988], [45443], [905], [49651], [16614], [34993], [6781], [30803], [35869], [8001], [41604], [28118], [46462], [46762], [16262], [17281], [5774], [10943], [5013], [18257], [6750], [4713], [3951], [11899], [38791], [16943], [37596], [9318], [18413], [40473], [13208], [16375]] diff --git a/static/koboldai.js b/static/koboldai.js index cce66f80..05dcc47e 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -1472,6 +1472,7 @@ function show_model_menu(data) { document.getElementById("modelurl").classList.add("hidden"); document.getElementById("use_gpu_div").classList.add("hidden"); document.getElementById("use_8_bit_div").classList.add("hidden"); + document.getElementById("use_4_bit_div").classList.add("hidden"); document.getElementById("modellayers").classList.add("hidden"); document.getElementById("oaimodel").classList.add("hidden"); var model_layer_bars = document.getElementById('model_layer_bars'); @@ -1646,6 +1647,14 @@ function selected_model_info(data) { document.getElementById("use_8_bit").checked = false; } + //hide or unhide 4 bit mode + if (data.bit_4_available) { + document.getElementById("use_4_bit_div").classList.remove("hidden"); + } else { + document.getElementById("use_4_bit_div").classList.add("hidden"); + document.getElementById("use_4_bit").checked = false; + } + //default URL loading if (data.default_url != null) { document.getElementById("modelurl").value = data.default_url; @@ -1815,7 +1824,7 @@ function selected_model_info(data) { } accept.disabled = false; - + set_4_bit_mode(invert=false); } function 
update_gpu_layers() { @@ -1876,7 +1885,8 @@ function load_model() { 'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(), 'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value, 'online_model': selected_models, - 'use_8_bit': document.getElementById('use_8_bit').checked}; + 'use_8_bit': document.getElementById('use_8_bit').checked, + 'use_4_bit': document.getElementById('use_4_bit').checked}; socket.emit("load_model", message); closePopups(); } @@ -3160,6 +3170,22 @@ function save_preset() { closePopups(); } +function set_4_bit_mode(invert=true) { + bit_4_status = document.getElementById("use_4_bit").checked; + if (invert) { + bit_4_status = !bit_4_status; + } + if (bit_4_status) { + document.getElementById("modellayers").classList.add("hidden"); + socket.emit("use_4_bit_toggle", {"use_4_bit": false}); + } else { + document.getElementById("modellayers").classList.remove("hidden"); + socket.emit("use_4_bit_toggle", {"use_4_bit": true}); + } +} + + + //--------------------------------------------General UI Functions------------------------------------ function set_ui_level(level) { for (classname of ['setting_container', 'setting_container_single', 'setting_container_single_wide', 'biasing', 'palette_area', 'advanced_theme']) { @@ -7301,4 +7327,4 @@ $el("#gamescreen").addEventListener("paste", function(event) { false, event.clipboardData.getData("text/plain") ); -}); \ No newline at end of file +}); diff --git a/templates/popups.html b/templates/popups.html index 44cf7cb6..804b1b9f 100644 --- a/templates/popups.html +++ b/templates/popups.html @@ -75,6 +75,10 @@
Use 8 bit mode
+ @@ -402,4 +406,4 @@ -
\ No newline at end of file +
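
Taken together, patches 08 through 11 converge on a single 4-bit code path: the GPTQ loaders are imported from the repos/gptq submodule, a 4bit.pt checkpoint is expected next to the model files, and the detected model_type selects which loader to call. The sketch below summarizes that dispatch as of this patch; it is a simplified illustration, not code lifted verbatim from aiserver.py. The load_quant imports, the 4bit.pt name, the model-type strings, and the error messages come from the diffs above, while the load_4bit_model wrapper and its arguments are invented here for clarity, and later patches in this series extend the load_quant calls with a group-size argument.

# Simplified sketch of the 4-bit loading dispatch as of PATCH 11 (illustrative only).
import os
import sys
from pathlib import Path

# The GPTQ loaders live in the repos/gptq submodule added in PATCH 10.
sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
from gptj import load_quant as gptj_load_quant
from gptneox import load_quant as gptneox_load_quant
from llama import load_quant as llama_load_quant

from transformers import AutoTokenizer, LlamaTokenizer


def load_4bit_model(model_path, model_type, gpu_device=0):
    # The quantized weights are expected alongside the model config as 4bit.pt.
    checkpoint = os.path.join(model_path, "4bit.pt")
    if not os.path.isfile(checkpoint):
        raise RuntimeError(f"4-bit load failed. PT-File not found at {checkpoint}")

    # Pick the GPTQ loader that matches the detected model type.
    if model_type == "gptj":
        model = gptj_load_quant(model_path, checkpoint, 4)
        tokenizer = AutoTokenizer.from_pretrained(model_path)
    elif model_type == "gpt_neox":
        model = gptneox_load_quant(model_path, checkpoint, 4)
        tokenizer = AutoTokenizer.from_pretrained(model_path)
    elif model_type == "llama":
        model = llama_load_quant(model_path, checkpoint, 4)
        tokenizer = LlamaTokenizer.from_pretrained(model_path)
    else:
        raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")

    # Breakmodel and lazy loading are disabled for 4-bit, so the whole model is
    # moved to a single GPU without the usual .half() conversion.
    return model.to(gpu_device), tokenizer
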
From c7edc764b95d44603e4d450d4326ce3628188ef3 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 21 Mar 2023 21:58:31 +0000 Subject: [PATCH 12/47] Fix llama loading --- aiserver.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/aiserver.py b/aiserver.py index 7497dfb9..967af85f 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2915,10 +2915,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal @functools.lru_cache(maxsize=None) def get_original_key(key): - # try: - return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) - # except ValueError: - # return key + try: + return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len) + except ValueError: + return key for key, value in model_dict.items(): original_key = get_original_key(key) @@ -3128,8 +3128,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, aborting 4-bit load") - use_4_bit = False + print(f"4-bit file {path_4bit} not found, loading failed") + raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") if use_4_bit: print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") @@ -3155,7 +3155,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal tokenizer = GPT2Tokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache") except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) if model is None: raise RuntimeError("Model returned 'None'. 
This is not expected to happen, but due to this, the model will not load.") From 8941428c66c377baa10aa95afd3186733dd92b89 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 22 Mar 2023 06:22:34 +0000 Subject: [PATCH 13/47] Fix Kobold loading to CPU in 4-bit, causing CUDA ASSERT error --- aiserver.py | 6 ++++-- repos/gptq | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 967af85f..2c50cfcc 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3102,7 +3102,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors - if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel: + if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel and not use_4_bit: device_config(model_config) # Download model from Huggingface if it does not exist, otherwise load locally @@ -3133,6 +3133,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if use_4_bit: print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") + koboldai_vars.breakmodel = False + koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) @@ -3255,7 +3257,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) - if(not koboldai_vars.lazy_load): + if(not koboldai_vars.lazy_load and not use_4_bit): device_config(model.config) move_model_to_devices(model, use_4_bit) elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): diff --git a/repos/gptq b/repos/gptq index a8303654..791cfe37 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit a8303654c200c25577130466e5f9bc1e70fc8a50 +Subproject commit 791cfe376af33aa01032dd52147050083a6345cf From 026eb3205e0f48dac5a4aa965d3e48d79ec5e1ab Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 22 Mar 2023 22:12:06 +0000 Subject: [PATCH 14/47] Fix 4-bit loading error when not loading in 4-bit --- aiserver.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2c50cfcc..745a7cb8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3125,13 +3125,13 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): - path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") - - if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, loading failed") - raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") - if use_4_bit: + path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") + + if not os.path.isfile(path_4bit): + print(f"4-bit file {path_4bit} not found, loading failed") + raise RuntimeError(f"4-bit load failed. 
PT-File not found at {path_4bit}") + print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") koboldai_vars.breakmodel = False koboldai_vars.usegpu = True From 9dcba3897810499786d1fb4b4bd8d41ef595a130 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 24 Mar 2023 19:07:28 +0000 Subject: [PATCH 15/47] Pin transformers to a working Llama-compatible version --- environments/huggingface.yml | 2 +- environments/rocm.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 42dda9c3..6807627e 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -30,7 +30,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - git+https://github.com/zphang/transformers@llama_push + - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc - huggingface_hub==0.12.1 - safetensors diff --git a/environments/rocm.yml b/environments/rocm.yml index 43fd331f..a1d3d8b0 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -29,7 +29,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers==4.25.1 + - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 - huggingface_hub==0.12.1 - safetensors - accelerate From 2e7a8a1a66a3813ff2f68b5e37f659479f44afc2 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 23 Mar 2023 05:53:30 +0000 Subject: [PATCH 16/47] Adapt KoboldAI to latest gptq changes --- aiserver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 745a7cb8..faee85c0 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3136,13 +3136,13 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.breakmodel = False koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") From d1a2005a2710e0720fe2a863ebe4f5d1f9b2ad18 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 27 Mar 2023 20:45:21 +0000 Subject: [PATCH 17/47] Add support for old and new 4-bit format. 
Old one needs 4bit-old.pt file to launch --- aiserver.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index faee85c0..fa2af0f3 100644 --- a/aiserver.py +++ b/aiserver.py @@ -94,7 +94,6 @@ sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant -vars_4bit = {} if lupa.LUA_VERSION[:2] != (5, 4): @@ -3127,9 +3126,29 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if use_4_bit: path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") + path_4bit_old = os.path.join(koboldai_vars.custmodpth, "4bit-old.pt") + + # Monkey-patch in old-format pt-file support + if not os.path.isfile(path_4bit): + print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") + path_4bit = path_4bit_old + + import llama, opt, gptneox, gptj, old_quant, quant_cuda_old + llama.make_quant = old_quant.old_make_quant + opt.make_quant = old_quant.old_make_quant + gptneox.make_quant = old_quant.old_make_quant + gptj.make_quant = old_quant.old_make_quant + elif llama.make_quant == old_quant.old_make_quant: + # Undo monkey patch + import quant + llama.make_quant = quant.make_quant + opt.make_quant = quant.make_quant + gptneox.make_quant = quant.make_quant + gptj.make_quant = quant.make_quant + if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, loading failed") + print(f"4-bit old-format file {path_4bit} not found, loading failed") raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") From 0f1fc46078f9a751e35c0c5e7e35d091a10f3f9b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 27 Mar 2023 21:30:43 +0000 Subject: [PATCH 18/47] Fix errors during inference --- aiserver.py | 14 +++++++++++--- repos/gptq | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/aiserver.py b/aiserver.py index fa2af0f3..2c2eff1b 100644 --- a/aiserver.py +++ b/aiserver.py @@ -94,6 +94,7 @@ sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant +monkey_patched_4bit = False if lupa.LUA_VERSION[:2] != (5, 4): @@ -3128,23 +3129,28 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") path_4bit_old = os.path.join(koboldai_vars.custmodpth, "4bit-old.pt") + global monkey_patched_4bit + # Monkey-patch in old-format pt-file support if not os.path.isfile(path_4bit): print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") path_4bit = path_4bit_old - import llama, opt, gptneox, gptj, old_quant, quant_cuda_old + import llama, opt, gptneox, gptj, old_quant llama.make_quant = old_quant.old_make_quant opt.make_quant = old_quant.old_make_quant gptneox.make_quant = old_quant.old_make_quant gptj.make_quant = old_quant.old_make_quant - elif llama.make_quant == old_quant.old_make_quant: + monkey_patched_4bit = True + elif monkey_patched_4bit: # Undo monkey patch - import quant + print("Undoing 4-bit old format monkey patch") + import llama, opt, gptneox, gptj, quant llama.make_quant = quant.make_quant opt.make_quant = quant.make_quant gptneox.make_quant = quant.make_quant gptj.make_quant = quant.make_quant + 
monkey_patched_4bit = False if not os.path.isfile(path_4bit): @@ -3165,6 +3171,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") + + model = model.float() else: try: tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) diff --git a/repos/gptq b/repos/gptq index 791cfe37..0748a680 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 791cfe376af33aa01032dd52147050083a6345cf +Subproject commit 0748a680e95ab0a9f8860953a5d705a01070d1cc From ef6fe680a97efb740db946c0e4fbf5d2dd54889b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 28 Mar 2023 06:30:02 +0000 Subject: [PATCH 19/47] Fix high VRAM usage caused by workaround for scalar type error --- aiserver.py | 2 +- repos/gptq | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2c2eff1b..27cafd59 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3172,7 +3172,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") - model = model.float() + model = model.half() else: try: tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) diff --git a/repos/gptq b/repos/gptq index 0748a680..5d07f25a 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 0748a680e95ab0a9f8860953a5d705a01070d1cc +Subproject commit 5d07f25a30f8602aedb3e69f11de07624e486ce9 From e698f22706c806e05fdd8c58f91f3d560bcba0d6 Mon Sep 17 00:00:00 2001 From: Digitous <107712289+Digitous@users.noreply.github.com> Date: Tue, 28 Mar 2023 19:14:46 -0400 Subject: [PATCH 20/47] Update README.md --- README.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/README.md b/README.md index 20a1957a..c6e922aa 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,57 @@ +## This is a fork of KoboldAI that implements 4bit GPTQ quantized support to include Llama. + +### Install/Use Guide +(This guide is for both Linux and Windows and assumes user has git installed and a basic grasp of command line use) + +In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. + +git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules + +cd KoboldAI + +Next step, subfolder mode or B: option doesn't matter choose either + +[if on Windows] install_requirements.bat if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. + +[if on Linux] install_requirements.sh + + +[if on Windows] run commandline.bat + +[if on Linux] run commandline.sh + +commandline.bat/commandline.sh will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). 
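
(Editorial aside: the old-format fallback introduced in the patches above reduces to a module-level flag plus a swap of `make_quant`. The sketch below is a condensed illustration of that pattern, not the literal aiserver.py code; it assumes the `repos/gptq` modules `llama`, `opt`, `gptneox`, `gptj`, `old_quant` and `quant` are importable, as they are after the `sys.path.insert` done at startup.)

```python
# Module-level flag, mirroring aiserver.py's monkey_patched_4bit
monkey_patched_4bit = False

def patch_old_4bit_format():
    """Route make_quant through the old-format implementation (4bit-old.pt checkpoints)."""
    global monkey_patched_4bit
    import llama, opt, gptneox, gptj, old_quant
    for mod in (llama, opt, gptneox, gptj):
        mod.make_quant = old_quant.old_make_quant
    monkey_patched_4bit = True

def unpatch_old_4bit_format():
    """Restore the current-format make_quant before loading a new-format checkpoint."""
    global monkey_patched_4bit
    import llama, opt, gptneox, gptj, quant
    for mod in (llama, opt, gptneox, gptj):
        mod.make_quant = quant.make_quant
    monkey_patched_4bit = False
```
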
+ + +cd repos + +cd gptq + + +[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] python setup_cuda.py install + +[if on Linux] python setup_cuda.py install + +After the Cuda kernel is compiled, return to KoboldAI base directory + +[if on Windows (only applies to windows users)] pip install flask_cors + +If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) + +Put your 4bit quantized .pt in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). + +Then move your model folder to KoboldAI/models, and rename the .pt in your model folder to 4bit.pt + +So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). + +If you haven't done so already, exit the command prompt/leave KAI's (base) venv + +Run play.bat [windows] or play.sh [linux] + +Switch to UI2, enable Experimental UI under the Interface tab, then load your model and be sure 4-bit toggle is on. + +The 4bit toggle shows when a model to load is selected. + ## KoboldAI - Your gateway to GPT writing This is a browser-based front-end for AI-assisted writing with multiple local & remote AI models. It offers the standard array of tools, including Memory, Author's Note, World Info, Save & Load, adjustable AI settings, formatting options, and the ability to import existing AI Dungeon adventures. You can also turn on Adventure mode and play the game like AI Dungeon Unleashed. From 8d008b87a608beb47e5f41473a40b437aa33d4b4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 13:25:06 +0000 Subject: [PATCH 21/47] Add OPT support --- aiserver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/aiserver.py b/aiserver.py index 27cafd59..edce6bf1 100644 --- a/aiserver.py +++ b/aiserver.py @@ -94,6 +94,7 @@ sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant +from opt import load_quant as opt_load_quant monkey_patched_4bit = False @@ -3169,6 +3170,9 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal elif koboldai_vars.model_type == "llama": model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) + elif koboldai_vars.model_type == "opt": + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. 
Model type {koboldai_vars.model_type} not supported in 4-bit") From f6f7687cc015821c4d4b4cff7dbfea1052514efb Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 14:47:59 +0000 Subject: [PATCH 22/47] Add 4bit safetensor support, improve loading code --- aiserver.py | 78 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/aiserver.py b/aiserver.py index edce6bf1..2679ddc8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -90,6 +90,7 @@ global tpu_mtj_backend # 4-bit dependencies from pathlib import Path +import glob sys.path.insert(0, os.path.abspath(Path("repos/gptq"))) from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant @@ -2657,6 +2658,50 @@ def unload_model(): #Reload our badwords koboldai_vars.badwordsids = koboldai_settings.badwordsids_default + + +def prepare_4bit_load(modelpath): + paths_4bit = ["4bit.pt", "4bit.safetensors"] + paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] + result = False + for p in paths_4bit: + p = os.path.join(modelpath, p) + if os.path.isfile(p): + result = p + break + + global monkey_patched_4bit + + # Monkey-patch in old-format pt-file support + if not result: + print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") + for p in paths_4bit_old: + p = os.path.join(modelpath, p) + if os.path.isfile(p): + result = p + break + + if not result: + print(f"4-bit old-format file {path_4bit} not found, loading failed") + raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") + + import llama, opt, gptneox, gptj, old_quant + llama.make_quant = old_quant.old_make_quant + opt.make_quant = old_quant.old_make_quant + gptneox.make_quant = old_quant.old_make_quant + gptj.make_quant = old_quant.old_make_quant + monkey_patched_4bit = True + elif monkey_patched_4bit: + # Undo monkey patch + print("Undoing 4-bit old format monkey patch") + import llama, opt, gptneox, gptj, quant + llama.make_quant = quant.make_quant + opt.make_quant = quant.make_quant + gptneox.make_quant = quant.make_quant + gptj.make_quant = quant.make_quant + monkey_patched_4bit = False + + return result def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False): @@ -3127,36 +3172,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(os.path.isdir(koboldai_vars.custmodpth)): if use_4_bit: - path_4bit = os.path.join(koboldai_vars.custmodpth, "4bit.pt") - path_4bit_old = os.path.join(koboldai_vars.custmodpth, "4bit-old.pt") - - global monkey_patched_4bit - - # Monkey-patch in old-format pt-file support - if not os.path.isfile(path_4bit): - print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") - path_4bit = path_4bit_old - - import llama, opt, gptneox, gptj, old_quant - llama.make_quant = old_quant.old_make_quant - opt.make_quant = old_quant.old_make_quant - gptneox.make_quant = old_quant.old_make_quant - gptj.make_quant = old_quant.old_make_quant - monkey_patched_4bit = True - elif monkey_patched_4bit: - # Undo monkey patch - print("Undoing 4-bit old format monkey patch") - import llama, opt, gptneox, gptj, quant - llama.make_quant = quant.make_quant - opt.make_quant = quant.make_quant - gptneox.make_quant = quant.make_quant - gptj.make_quant = quant.make_quant - monkey_patched_4bit = False - - - if not os.path.isfile(path_4bit): - 
print(f"4-bit old-format file {path_4bit} not found, loading failed") - raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") + path_4bit = prepare_4bit_load(koboldai_vars.custmodpth) print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") koboldai_vars.breakmodel = False @@ -3171,7 +3187,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4) + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") From a0bc77042624571b878d734ebc41331f6f4d9342 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 19:49:05 +0000 Subject: [PATCH 23/47] Add basic groupsize support Write groupsize into filename, for example 4bit-128b.safetensors for groupsize 128 --- aiserver.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2679ddc8..38805287 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2661,13 +2661,19 @@ def unload_model(): def prepare_4bit_load(modelpath): - paths_4bit = ["4bit.pt", "4bit.safetensors"] + paths_4bit = ["4bit*.safetensors", "4bit*.pt"] paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"] result = False + groupsize = -1 for p in paths_4bit: p = os.path.join(modelpath, p) - if os.path.isfile(p): - result = p + val = glob.glob(p) + if val: + result = val[0] + fname = Path(result).parts[-1] + g = re.findall("^(?:4bit)(?:-)(\d+)(?:b-?)", fname) + if g: + groupsize = int(g[0]) break global monkey_patched_4bit @@ -2701,7 +2707,7 @@ def prepare_4bit_load(modelpath): gptj.make_quant = quant.make_quant monkey_patched_4bit = False - return result + return result, groupsize def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False): @@ -3172,22 +3178,23 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(os.path.isdir(koboldai_vars.custmodpth)): if use_4_bit: - path_4bit = prepare_4bit_load(koboldai_vars.custmodpth) + path_4bit, groupsize = prepare_4bit_load(koboldai_vars.custmodpth) + print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") koboldai_vars.breakmodel = False koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif 
koboldai_vars.model_type == "opt": - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, -1) + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. Model type {koboldai_vars.model_type} not supported in 4-bit") From 73d5ec0e5dd234852a66331b681734e8beb13781 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 20:07:26 +0000 Subject: [PATCH 24/47] Pull latest gptq-changes --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 5d07f25a..6f80e1fd 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 5d07f25a30f8602aedb3e69f11de07624e486ce9 +Subproject commit 6f80e1fdd7232d66e45e02ebf00be4c5d5f933bc From 9d0477f5f73471995fa3e23789a0ac4aa9108b33 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 22:05:44 +0000 Subject: [PATCH 25/47] Fix bug where it picks old model despite new one available --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 38805287..812bc4a8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2667,7 +2667,7 @@ def prepare_4bit_load(modelpath): groupsize = -1 for p in paths_4bit: p = os.path.join(modelpath, p) - val = glob.glob(p) + val = [v for v in glob.glob(p) if "4bit-old" not in v] if val: result = val[0] fname = Path(result).parts[-1] From 61b13604b6ad116561488ab146c3959f40d98099 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Thu, 30 Mar 2023 10:57:04 +0200 Subject: [PATCH 26/47] Fix bug in 4-bit load fallback --- aiserver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 812bc4a8..fe0f9a8c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2680,7 +2680,7 @@ def prepare_4bit_load(modelpath): # Monkey-patch in old-format pt-file support if not result: - print(f"4-bit file {path_4bit} not found, falling back to {path_4bit_old}") + print("4-bit file not found, falling back to old format.") for p in paths_4bit_old: p = os.path.join(modelpath, p) if os.path.isfile(p): @@ -2688,8 +2688,8 @@ def prepare_4bit_load(modelpath): break if not result: - print(f"4-bit old-format file {path_4bit} not found, loading failed") - raise RuntimeError(f"4-bit load failed. PT-File not found at {path_4bit}") + print("4-bit old-format file not found, loading failed.") + raise RuntimeError(f"4-bit load failed. 
PT-File not found.") import llama, opt, gptneox, gptj, old_quant llama.make_quant = old_quant.old_make_quant From aa2292b3a4dff467e9afaa3270d80fcda4c7994f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 29 Mar 2023 21:43:49 +0000 Subject: [PATCH 27/47] Enable multi-gpu support --- aiserver.py | 50 ++++++++++++++++------------------------------ static/koboldai.js | 9 +-------- 2 files changed, 18 insertions(+), 41 deletions(-) diff --git a/aiserver.py b/aiserver.py index fe0f9a8c..7a4370c0 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1139,7 +1139,7 @@ def move_model_to_devices(model, use_4_bit=False): import accelerate.utils for key, value in model.state_dict().items(): target_dtype = torch.float32 if breakmodel.primary_device == "cpu" else torch.float16 - if(value.dtype is not target_dtype): + if(value.dtype not in (torch.bool, torch.int) and value.dtype is not target_dtype): accelerate.utils.set_module_tensor_to_device(model, key, target_dtype) disk_blocks = breakmodel.disk_blocks gpu_blocks = breakmodel.gpu_blocks @@ -1919,18 +1919,6 @@ def get_cluster_models(msg): emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2") -@socketio.on("use_4_bit_toggle") -def use_4_bit_toggle(msg): - # Disable lazy_load and breakmodel - if msg["use_4_bit"]: - koboldai_vars.lazy_load = False - koboldai_vars.nobreakmodel = True - else: - koboldai_vars.lazy_load = True - koboldai_vars.nobreakmodel = False - - # TODO: Reload JS values for this stuff - # Function to patch transformers to use our soft prompt def patch_causallm(model): from torch.nn import Embedding @@ -3033,11 +3021,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) #print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
", end="", flush=True) model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - if not use_4_bit: - if model_dict[key].dtype is torch.float32: - koboldai_vars.fp32_model = True - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - model_dict[key] = model_dict[key].to(torch.float16) + if model_dict[key].dtype is torch.float32: + koboldai_vars.fp32_model = True + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: + model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -3061,17 +3048,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if utils.offload_index: for name, tensor in utils.named_buffers: dtype = tensor.dtype - if not use_4_bit: - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): - dtype = torch.float32 - if name in model_dict and model_dict[name].dtype is not dtype: - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is not dtype: - tensor = tensor.to(dtype) - if name not in utils.offload_index: - accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu): + dtype = torch.float16 + if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel): + dtype = torch.float32 + if name in model_dict and model_dict[name].dtype is not dtype: + model_dict[name] = model_dict[name].to(dtype) + if tensor.dtype is not dtype: + tensor = tensor.to(dtype) + if name not in utils.offload_index: + accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index) accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache") utils.bar.close() utils.bar = None @@ -3154,7 +3140,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors - if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel and not use_4_bit: + if (utils.HAS_ACCELERATE or koboldai_vars.lazy_load and koboldai_vars.hascuda and koboldai_vars.breakmodel) and not koboldai_vars.nobreakmodel: device_config(model_config) # Download model from Huggingface if it does not exist, otherwise load locally @@ -3182,8 +3168,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}") print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") - koboldai_vars.breakmodel = False - koboldai_vars.usegpu = True if koboldai_vars.model_type == "gptj": model = 
gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) @@ -3311,7 +3295,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) - if(not koboldai_vars.lazy_load and not use_4_bit): + if(not koboldai_vars.lazy_load): device_config(model.config) move_model_to_devices(model, use_4_bit) elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): diff --git a/static/koboldai.js b/static/koboldai.js index 05dcc47e..89ee2ea1 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -3173,14 +3173,7 @@ function save_preset() { function set_4_bit_mode(invert=true) { bit_4_status = document.getElementById("use_4_bit").checked; if (invert) { - bit_4_status = !bit_4_status; - } - if (bit_4_status) { - document.getElementById("modellayers").classList.add("hidden"); - socket.emit("use_4_bit_toggle", {"use_4_bit": false}); - } else { - document.getElementById("modellayers").classList.remove("hidden"); - socket.emit("use_4_bit_toggle", {"use_4_bit": true}); + bit_4_status = !bit_4_status; } } From 6eae4574793687b517c45e85e5fc178015c8d088 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 31 Mar 2023 15:36:03 +0200 Subject: [PATCH 28/47] Fix 4bit groupsize param letter Use g instead of b for groupsize name, for example 4bit-128g.safetensors --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 7a4370c0..e7c789ac 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2659,7 +2659,7 @@ def prepare_4bit_load(modelpath): if val: result = val[0] fname = Path(result).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\d+)(?:b-?)", fname) + g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname) if g: groupsize = int(g[0]) break From d3a5ca65057f4f7cf9a2998cd13e5e04de829df1 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 1 Apr 2023 08:52:08 +0000 Subject: [PATCH 29/47] Update gptq submodule to latest --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 6f80e1fd..f4de1019 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 6f80e1fdd7232d66e45e02ebf00be4c5d5f933bc +Subproject commit f4de1019fedce779915e050a517f4cc8dee432dc From bf0c999412b48a6de6a174a33bce3f8b92df1e16 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 1 Apr 2023 14:19:51 +0200 Subject: [PATCH 30/47] Update GPTQ to support AMD --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index f4de1019..954b3218 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit f4de1019fedce779915e050a517f4cc8dee432dc +Subproject commit 954b32183adda2acd437a3ab0683a28ca3c7e4c9 From 110f8229c565a1ac64060e4e1785d4563920d4f4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 1 Apr 2023 21:33:05 +0200 Subject: [PATCH 31/47] Add cudatoolkit-dev for compilation, compatible gcc 9 and update transformers to fix error in gptq --- environments/huggingface.yml | 5 ++++- environments/rocm.yml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 6807627e..71d26e9c 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -11,6 +11,9 @@ dependencies: - pytorch=1.11.* - python=3.8.* - cudatoolkit=11.1 + - cudatoolkit-dev=11.1 + - 
gcc=9.* + - gxx=9.* - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -30,7 +33,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 + - git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc - huggingface_hub==0.12.1 - safetensors diff --git a/environments/rocm.yml b/environments/rocm.yml index a1d3d8b0..dda2a2b2 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -29,7 +29,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - git+https://github.com/huggingface/transformers@88dae78f4d204428568f749e864ef5ba09da7d24 + - git+https://github.com/huggingface/transformers@c612628045822f909020f7eb6784c79700813eda - huggingface_hub==0.12.1 - safetensors - accelerate From 2729b7764047b7c1d35f7a20e5900d61147fe598 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 2 Apr 2023 10:32:19 +0200 Subject: [PATCH 32/47] Add offload.py adapted from llama_inference_offload.py, with multi-gpu support and some improvements. Not yet functional, and still just supports Llama --- aiserver.py | 17 +++++++++++++++-- repos/gptq | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index e7c789ac..82992461 100644 --- a/aiserver.py +++ b/aiserver.py @@ -96,6 +96,7 @@ from gptj import load_quant as gptj_load_quant from gptneox import load_quant as gptneox_load_quant from llama import load_quant as llama_load_quant from opt import load_quant as opt_load_quant +from offload import load_quant_offload monkey_patched_4bit = False @@ -3137,6 +3138,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.model_type == "gpt2"): lowmem = {} koboldai_vars.lazy_load = False # Also, lazy loader doesn't support GPT-2 models + + gpu_layers_list = [int(l) for l in gpu_layers.split(",")] + offload_4bit = use_4_bit and sum(gpu_layers_list) < utils.num_layers(model_config) + + if offload_4bit: + koboldai_vars.lazy_load = False # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors @@ -3175,7 +3182,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": - model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(llama_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) @@ -3286,7 +3296,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal patch_causallm(model) if(koboldai_vars.hascuda): - if(koboldai_vars.usegpu): + if offload_4bit: + koboldai_vars.modeldim = get_hidden_size_from_model(model) + generator = model.generate + elif(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) if not use_4_bit: model = model.half().to(koboldai_vars.gpu_device) diff --git a/repos/gptq b/repos/gptq index 954b3218..f8bc2886 160000 --- 
a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 954b32183adda2acd437a3ab0683a28ca3c7e4c9 +Subproject commit f8bc2886cb2e2aaa704ea02404c2ff3841eb6fcf From e742083703ea8111379492c75e62f9dfffd54a28 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 2 Apr 2023 11:17:29 +0200 Subject: [PATCH 33/47] Fix multi-gpu-offloading --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index f8bc2886..971a5785 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit f8bc2886cb2e2aaa704ea02404c2ff3841eb6fcf +Subproject commit 971a5785a356460f2073b0141da0c1e0b8fdcbf6 From c8d00b7a10fd48f31f9d3fc4f4010f5481c772d4 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 2 Apr 2023 18:36:31 +0200 Subject: [PATCH 34/47] Add CPU offloading support for GPT-NeoX, GPT-J and OPT --- aiserver.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 82992461..2365f58b 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3144,6 +3144,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if offload_4bit: koboldai_vars.lazy_load = False + print("4-bit CPU offloader active") # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors @@ -3176,10 +3177,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal print(f"Trying to load {koboldai_vars.model_type} model in 4-bit") if koboldai_vars.model_type == "gptj": - model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(gptj_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = gptj_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "gpt_neox": - model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(gptneox_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = gptneox_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "llama": if offload_4bit: @@ -3188,7 +3195,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model = llama_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = LlamaTokenizer.from_pretrained(koboldai_vars.custmodpth) elif koboldai_vars.model_type == "opt": - model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) + if offload_4bit: + model = load_quant_offload(opt_load_quant, koboldai_vars.custmodpth, path_4bit, 4, groupsize, gpu_layers_list) + else: + model = opt_load_quant(koboldai_vars.custmodpth, path_4bit, 4, groupsize) tokenizer = AutoTokenizer.from_pretrained(koboldai_vars.custmodpth) else: raise RuntimeError(f"4-bit load failed. 
Model type {koboldai_vars.model_type} not supported in 4-bit") From ec4177a6d6cf3549f3aebffc1a54b4799c506657 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 3 Apr 2023 06:50:36 +0200 Subject: [PATCH 35/47] Remove cudatoolkit-dev and gcc/gxx 9 from conda env because they didn't resolve on Windows --- environments/huggingface.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 71d26e9c..b48c2547 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -11,9 +11,6 @@ dependencies: - pytorch=1.11.* - python=3.8.* - cudatoolkit=11.1 - - cudatoolkit-dev=11.1 - - gcc=9.* - - gxx=9.* - eventlet=0.33.3 - dnspython=2.2.1 - markdown From b9df9b6f590388a8fc6139e25b1d1c24c21fac52 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 3 Apr 2023 20:27:17 +0200 Subject: [PATCH 36/47] Improve CPU offloading speed significantly when offloading less than half of the layers --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 971a5785..e2f567e9 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 971a5785a356460f2073b0141da0c1e0b8fdcbf6 +Subproject commit e2f567e9332c2d92f1c26ea0f7c935fe7cd65655 From ce6761e74436298424d3ea7bb964bb243e8cd88a Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 4 Apr 2023 07:46:53 +0200 Subject: [PATCH 37/47] Fix issue causing expected scalar type Float but found Half RuntimeErrors --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index e2f567e9..08c5054d 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit e2f567e9332c2d92f1c26ea0f7c935fe7cd65655 +Subproject commit 08c5054d45b8c6277e74a35841570dc7b8cbc608 From 8b4375307c2e4ea1154125fea1e00ef8c1b38415 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 5 Apr 2023 21:10:40 +0200 Subject: [PATCH 38/47] Update file formatting section in guide --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c6e922aa..0296e876 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ If you haven't already done so, create a model folder with the same name as your Put your 4bit quantized .pt in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). -Then move your model folder to KoboldAI/models, and rename the .pt in your model folder to 4bit.pt +Then move your model folder to KoboldAI/models, and rename the .pt/safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). 
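
(Editorial aside: the filename convention described in the README change above is what `prepare_4bit_load` in aiserver.py keys off. The following is a condensed sketch of that resolution logic, assuming `modelpath` is the model folder; it is an illustration rather than the exact function.)

```python
import glob, os, re
from pathlib import Path

def resolve_4bit_file(modelpath):
    # Prefer new-format files; "4bit-old" files are only used as a fallback.
    for pattern in ("4bit*.safetensors", "4bit*.pt"):
        hits = [p for p in glob.glob(os.path.join(modelpath, pattern)) if "4bit-old" not in p]
        if hits:
            fname = Path(hits[0]).parts[-1]
            g = re.findall(r"^(?:4bit)(?:-)(\d+)(?:g-?)", fname)  # 4bit-128g.safetensors -> ["128"]
            return hits[0], (int(g[0]) if g else -1)              # -1 means "no groupsize"
    for old_name in ("4bit-old.pt", "4bit-old.safetensors"):
        p = os.path.join(modelpath, old_name)
        if os.path.isfile(p):
            return p, -1  # old format; triggers the old_make_quant monkey patch
    raise RuntimeError("4-bit load failed. PT-File not found.")
```

With a file named `4bit-128g.safetensors` in the model folder, this yields that path and groupsize 128, which is then passed to the matching `*_load_quant` call (or to `load_quant_offload` when some layers are kept on the CPU).
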
From 40092cc9faed0d225391699e4cada1b9fb043dff Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 5 Apr 2023 21:49:13 +0200 Subject: [PATCH 39/47] Improve guide formatting --- README.md | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 0296e876..e103bbff 100644 --- a/README.md +++ b/README.md @@ -5,48 +5,46 @@ In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created. -git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules +`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules` -cd KoboldAI +`cd KoboldAI` -Next step, subfolder mode or B: option doesn't matter choose either +Next step, (Windows) subfolder mode or B: option doesn't matter choose either -[if on Windows] install_requirements.bat if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. +[if on Windows] `install_requirements.bat` if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory. -[if on Linux] install_requirements.sh +[if on Linux] `install_requirements.sh` -[if on Windows] run commandline.bat +[if on Windows] run `commandline.bat` -[if on Linux] run commandline.sh +[if on Linux] run `commandline.sh` -commandline.bat/commandline.sh will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). +`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). -cd repos +`cd repos` -cd gptq +`cd gptq` -[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] python setup_cuda.py install +[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] `python setup_cuda.py install` -[if on Linux] python setup_cuda.py install +[if on Linux] `python setup_cuda.py install` After the Cuda kernel is compiled, return to KoboldAI base directory -[if on Windows (only applies to windows users)] pip install flask_cors - If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) -Put your 4bit quantized .pt in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). +Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). 
-Then move your model folder to KoboldAI/models, and rename the .pt/safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) +Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models or `4bit-g.pt` or `4bit-.safetensors` for a groupsize mode (Example: `4bit-128g.safetensors`) So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). If you haven't done so already, exit the command prompt/leave KAI's (base) venv -Run play.bat [windows] or play.sh [linux] +Run `play.bat` [windows] or `play.sh` [linux] Switch to UI2, enable Experimental UI under the Interface tab, then load your model and be sure 4-bit toggle is on. From 636c4e5a5284fa2a11af7aba2fdf55426047eb0f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 7 Apr 2023 11:48:57 +0200 Subject: [PATCH 40/47] Update gptq repo --- repos/gptq | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/gptq b/repos/gptq index 08c5054d..17c46a59 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 08c5054d45b8c6277e74a35841570dc7b8cbc608 +Subproject commit 17c46a59ff20da657e68f3267f853b0243b983bc From 7efd314428e0ad24b33fc9cd9ac19b45c6754e7b Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 7 Apr 2023 20:10:24 +0200 Subject: [PATCH 41/47] Improve guide --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 86661df3..f9be9660 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,11 @@ Next step, (Windows) subfolder mode or B: option doesn't matter choose either [if on Linux] run `commandline.sh` -`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment (as shown by (base) affixed to the prompt). - +`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment. +On Windows, this will show (base) at the start of the prompt line. +If it shows (base) on Linux, you most likely have a base conda environment that you need to deactivate (`conda deactivate`) +Then run `cd repos` `cd gptq` @@ -42,7 +44,7 @@ Then move your model folder to KoboldAI/models, and rename the .pt or .safetenso So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, so long as the 4-bit toggle switch is on, it'll load the quantized model (4-bit switch explained below). -If you haven't done so already, exit the command prompt/leave KAI's (base) venv +If you haven't done so already, exit the command prompt/leave KAI's conda env. 
(Close the commandline window on Windows, run `exit` on Linux) Run `play.bat` [windows] or `play.sh` [linux] From b628aec7194783da09035a3b8fe01f674df542ea Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 10 Apr 2023 22:37:16 +0200 Subject: [PATCH 42/47] Automatic installation of the quant_cuda module during install_requirements Kepler (K40+) and Maxwell support --- install_requirements.bat | 4 ++++ install_requirements.sh | 3 +++ repos/gptq | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/install_requirements.bat b/install_requirements.bat index 2a4534c1..05264259 100644 --- a/install_requirements.bat +++ b/install_requirements.bat @@ -49,6 +49,8 @@ umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\h umamba.exe -r B:\ clean -a -y rd B:\Python\pkgs /S /Q subst B: /d +call B:\python\condabin\activate +cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit @@ -60,5 +62,7 @@ umamba.exe create -r miniconda3\ -n base umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingface.yml -y --always-copy umamba.exe clean -a -y rd miniconda3\Python\pkgs /S /Q +call miniconda3\condabin\activate +cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit diff --git a/install_requirements.sh b/install_requirements.sh index 6f0e0dfd..7b5a8d5b 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -5,6 +5,9 @@ wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar - bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y + +# Install quant_cuda module for 4-bit +bin/micromamba run -r runtime -n koboldai pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl exit fi if [[ $1 = "rocm" ]]; then diff --git a/repos/gptq b/repos/gptq index 17c46a59..50b22e2b 160000 --- a/repos/gptq +++ b/repos/gptq @@ -1 +1 @@ -Subproject commit 17c46a59ff20da657e68f3267f853b0243b983bc +Subproject commit 50b22e2ba8ec0f5cf0dca719392a2ec5254e7228 From 687d107d20345a0cc46bb069914d0ce6a3bcf43d Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 10 Apr 2023 22:46:12 +0200 Subject: [PATCH 43/47] Update README, remove steps that are no longer required --- README.md | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/README.md b/README.md index f9be9660..0657fa0b 100644 --- a/README.md +++ b/README.md @@ -15,27 +15,6 @@ Next step, (Windows) subfolder mode or B: option doesn't matter choose either [if on Linux] `install_requirements.sh` - -[if on Windows] run `commandline.bat` - -[if on Linux] run `commandline.sh` - -`commandline.bat/commandline.sh` will put you in KoboldAI's virtual environment. -On Windows, this will show (base) at the start of the prompt line. 
-If it shows (base) on Linux, you most likely have a base conda environment that you need to deactivate (`conda deactivate`) - -Then run -`cd repos` - -`cd gptq` - - -[if on Windows, Visual Studio 2019 must be installed with C++ compiler option] `python setup_cuda.py install` - -[if on Linux] `python setup_cuda.py install` - -After the Cuda kernel is compiled, return to KoboldAI base directory - If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder) Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type). From 35f908e147fcac121bdafaf7ca4b751d8091f480 Mon Sep 17 00:00:00 2001 From: biscober <50845461+biscober@users.noreply.github.com> Date: Tue, 11 Apr 2023 02:37:48 +0000 Subject: [PATCH 44/47] Update install_requirements.bat (#7) * Update install_requirements.bat move command to dismount temp B drive to after pip install command which requires B drive to still be mounted * Update install_requirements.bat cmd /k not necessary * Update install_requirements.bat add quotes (probably not required but w/e) --- install_requirements.bat | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/install_requirements.bat b/install_requirements.bat index 05264259..3b735ddf 100644 --- a/install_requirements.bat +++ b/install_requirements.bat @@ -48,9 +48,9 @@ umamba.exe create -r B:\python\ -n base umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy umamba.exe -r B:\ clean -a -y rd B:\Python\pkgs /S /Q -subst B: /d call B:\python\condabin\activate -cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" +pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" +subst B: /d pause exit @@ -63,6 +63,6 @@ umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingf umamba.exe clean -a -y rd miniconda3\Python\pkgs /S /Q call miniconda3\condabin\activate -cmd /k "pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" +pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl" pause exit From 3eda7269f72bfa954a13aebb9d965b9c7dad9e61 Mon Sep 17 00:00:00 2001 From: Henk Date: Sat, 15 Apr 2023 14:58:24 +0200 Subject: [PATCH 45/47] Fix incorrect host merge --- aiserver.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 2ec6d817..59cfac0c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1613,9 +1613,6 @@ def general_startup(override_args=None): if args.localtunnel: koboldai_vars.host = True; - if args.host == "": - koboldai_vars.host = True - args.unblock = True if args.host: # This means --host option was submitted without an argument # Enable all LAN IPs (0.0.0.0/0) From 67334bd69848bc8f3c00f1015f9f95170d2c98a3 Mon Sep 17 00:00:00 2001 From: Henk Date: Sat, 15 Apr 2023 17:45:00 +0200 Subject: [PATCH 46/47] Pin accelerate version --- environments/rocm.yml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/rocm.yml b/environments/rocm.yml index e28c86cb..c3e95496 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -32,7 
+32,7 @@ dependencies: - transformers==4.28.0 - huggingface_hub==0.12.1 - safetensors - - accelerate + - accelerate==0.18.0 - git+https://github.com/VE-FORBRYDERNE/mkultra - ansi2html - flask_compress diff --git a/requirements.txt b/requirements.txt index c2a61ca6..23468656 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ markdown bleach==4.1.0 sentencepiece protobuf -accelerate +accelerate==0.18.0 flask-session==0.4.0 marshmallow>=3.13 apispec-webframeworks From b68860b3de1adef4e162834bd524c92e39dbe264 Mon Sep 17 00:00:00 2001 From: Henk Date: Sat, 15 Apr 2023 18:31:39 +0200 Subject: [PATCH 47/47] Workaround to make --host work again --- aiserver.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/aiserver.py b/aiserver.py index 59cfac0c..886a802e 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1482,7 +1482,7 @@ def general_startup(override_args=None): parser.add_argument("--noaimenu", action='store_true', help="Disables the ability to select the AI") parser.add_argument("--ngrok", action='store_true', help="Optimizes KoboldAI for Remote Play using Ngrok") parser.add_argument("--localtunnel", action='store_true', help="Optimizes KoboldAI for Remote Play using Localtunnel") - parser.add_argument("--host", type=str, default="", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc") + parser.add_argument("--host", type=str, default="Disabled", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc") parser.add_argument("--port", type=int, help="Specify the port on which the application will be joinable") parser.add_argument("--aria2_port", type=int, help="Specify the port on which aria2's RPC interface will be open if aria2 is installed (defaults to 6799)") parser.add_argument("--model", help="Specify the Model Type to skip the Menu") @@ -1613,14 +1613,14 @@ def general_startup(override_args=None): if args.localtunnel: koboldai_vars.host = True; - if args.host: + if args.host != "Disabled": # This means --host option was submitted without an argument # Enable all LAN IPs (0.0.0.0/0) + koboldai_vars.host = True + args.unblock = True if args.host != "": # Check if --host option was submitted with an argument # Parse the supplied IP(s) and add them to the allowed IPs list - koboldai_vars.host = True - args.unblock = True enable_whitelist = True for ip_str in args.host.split(","): if "/" in ip_str: @@ -1637,6 +1637,7 @@ def general_startup(override_args=None): print(f"Allowed IPs: {allowed_ips}") + if args.cpu: koboldai_vars.use_colab_tpu = False
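
(Editorial aside: the final patch restores `--host` by defaulting the argument to the sentinel "Disabled" and treating a bare `--host` as "open to all LAN", while a comma-separated value enables IP whitelisting with single IPs, ranges and subnets. The hunk only shows the subnet check, so the sketch below is an illustrative reconstruction of that whitelist parsing, not the exact aiserver.py code.)

```python
import ipaddress

def parse_host_whitelist(host_arg):
    """Expand "127.0.0.1,192.168.1.0-192.168.1.255,10.0.0.0/24" into a set of addresses."""
    allowed_ips = set()
    for ip_str in host_arg.split(","):
        if "/" in ip_str:
            # CIDR subnet, e.g. 10.0.0.0/24
            allowed_ips.update(ipaddress.ip_network(ip_str, strict=False))
        elif "-" in ip_str:
            # inclusive range, e.g. 192.168.1.0-192.168.1.255
            start, end = (ipaddress.ip_address(p) for p in ip_str.split("-"))
            for net in ipaddress.summarize_address_range(start, end):
                allowed_ips.update(net)
        else:
            # single address, e.g. 127.0.0.1
            allowed_ips.add(ipaddress.ip_address(ip_str))
    return allowed_ips

allowed_ips = parse_host_whitelist("127.0.0.1,192.168.1.0-192.168.1.255,10.0.0.0/24")
print(f"Allowed IPs: {len(allowed_ips)} entries")
```
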