diff --git a/.gitmodules b/.gitmodules
index 0107a8c3..c6f4b308 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,7 @@
 [submodule "KoboldAI-Horde-Bridge"]
 	path = KoboldAI-Horde-Bridge
 	url = https://github.com/db0/KoboldAI-Horde-Bridge
+[submodule "repos/gptq"]
+	path = repos/gptq
+	url = https://github.com/0cc4m/GPTQ-for-LLaMa
+	branch = a8303654c200c25577130466e5f9bc1e70fc8a50
diff --git a/README.md b/README.md
index 789b78d1..0657fa0b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,36 @@
+## This is a fork of KoboldAI that adds support for 4-bit GPTQ quantized models, including LLaMA.
+
+### Install/Use Guide
+(This guide covers both Linux and Windows and assumes the user has git installed and a basic grasp of command-line use.)
+
+In the command prompt/command line, navigate to where you want the KoboldAI subfolder to be created.
+
+`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules`
+
+`cd KoboldAI`
+
+Next step: run the installer. (On Windows it will ask for subfolder mode or the B: drive option; either choice works.)
+
+[if on Windows] `install_requirements.bat` (if the window closes when it finishes, reopen a command prompt and navigate back to your KoboldAI directory)
+
+[if on Linux] `install_requirements.sh`
+
+If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder).
+
+Put your 4-bit quantized .pt or .safetensors file in that folder along with all associated .json files and tokenizer.model (the .json files and tokenizer.model should come from the Hugging Face model folder of the same model type).
+
+Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in it to `4bit.pt` or `4bit.safetensors` for non-groupsize models, or to `4bit-<groupsize>g.pt` / `4bit-<groupsize>g.safetensors` for groupsize models (example: `4bit-128g.safetensors`).
+
+Your .pt model folder should then look like this: `4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json`. Note: the 4bit.pt file can sit in the same folder as the regular HF .bin files it was quantized from; as long as the 4-bit toggle is on, the quantized model will be loaded (the 4-bit toggle is explained below).
+
+If you haven't done so already, exit the command prompt/leave KAI's conda env (close the command-line window on Windows, run `exit` on Linux).
+
+Run `play.bat` [Windows] or `play.sh` [Linux].
+
+Switch to UI2, enable Experimental UI under the Interface tab, then load your model and make sure the 4-bit toggle is on.
+
+The 4-bit toggle appears once a model to load is selected.
+
 ## KoboldAI - Your gateway to GPT writing
 
 This is a browser-based front-end for AI-assisted writing with multiple local & remote AI models. It offers the standard array of tools, including Memory, Author's Note, World Info, Save & Load, adjustable AI settings, formatting options, and the ability to import existing AI Dungeon adventures. You can also turn on Adventure mode and play the game like AI Dungeon Unleashed.
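A quick way to sanity-check the folder layout described in the guide above before launching KoboldAI. This is an illustrative sketch, not part of the fork; the `check_model_folder` helper, its file list, and the example path are assumptions drawn from the README text.

```python
# Illustrative sketch (not part of this fork): verify a model folder matches the
# layout described in the install guide above before loading it in KoboldAI.
import glob
import os

EXPECTED_FILES = [  # taken from the example folder listing in the guide
    "config.json", "generation_config.json", "pytorch_model.bin.index.json",
    "special_tokens_map.json", "tokenizer.model", "tokenizer_config.json",
]

def check_model_folder(modelpath):
    """Return the 4-bit weight file found in modelpath, or raise with a hint."""
    weights = glob.glob(os.path.join(modelpath, "4bit*.safetensors")) + \
              glob.glob(os.path.join(modelpath, "4bit*.pt"))
    if not weights:
        raise FileNotFoundError(
            "No 4bit.pt/4bit.safetensors (or 4bit-<groupsize>g.*) file found; "
            "rename your quantized weights as described in the guide above.")
    missing = [f for f in EXPECTED_FILES
               if not os.path.isfile(os.path.join(modelpath, f))]
    if missing:
        print(f"Warning: files missing from the Hugging Face model folder: {missing}")
    return weights[0]

# Example (hypothetical path): check_model_folder("models/llama-13b-4bit")
```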
diff --git a/aiserver.py b/aiserver.py
index 6b243efb..7e9241f5 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -87,6 +87,18 @@
 allowed_ips = set() # empty set
 enable_whitelist = False
 
+# 4-bit dependencies
+from pathlib import Path
+import glob
+sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
+from gptj import load_quant as gptj_load_quant
+from gptneox import load_quant as gptneox_load_quant
+from llama import load_quant as llama_load_quant
+from opt import load_quant as opt_load_quant
+from offload import load_quant_offload
+monkey_patched_4bit = False
+
+
 if lupa.LUA_VERSION[:2] != (5, 4):
     logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.")
 
@@ -1002,8 +1014,6 @@ def getmodelname():
 def get_hidden_size_from_model(model):
     return model.get_input_embeddings().embedding_dim
-
-
 #==================================================================#
 # Allow the models to override some settings
 #==================================================================#
@@ -1305,7 +1315,7 @@ def general_startup(override_args=None):
     parser.add_argument("--noaimenu", action='store_true', help="Disables the ability to select the AI")
     parser.add_argument("--ngrok", action='store_true', help="Optimizes KoboldAI for Remote Play using Ngrok")
     parser.add_argument("--localtunnel", action='store_true', help="Optimizes KoboldAI for Remote Play using Localtunnel")
-    parser.add_argument("--host", type=str, default="", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc")
+    parser.add_argument("--host", type=str, default="Disabled", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc")
     parser.add_argument("--port", type=int, help="Specify the port on which the application will be joinable")
     parser.add_argument("--aria2_port", type=int, help="Specify the port on which aria2's RPC interface will be open if aria2 is installed (defaults to 6799)")
     parser.add_argument("--model", help="Specify the Model Type to skip the Menu")
@@ -1436,17 +1446,14 @@
     if args.localtunnel:
         koboldai_vars.host = True;
 
-    if args.host == "":
-        koboldai_vars.host = True
-        args.unblock = True
-    if args.host:
+    if args.host != "Disabled":
         # This means --host option was submitted without an argument
         # Enable all LAN IPs (0.0.0.0/0)
+        koboldai_vars.host = True
+        args.unblock = True
         if args.host != "":
             # Check if --host option was submitted with an argument
             # Parse the supplied IP(s) and add them to the allowed IPs list
-            koboldai_vars.host = True
-            args.unblock = True
             enable_whitelist = True
             for ip_str in args.host.split(","):
                 if "/" in ip_str:
@@ -1463,6 +1470,7 @@
 
             print(f"Allowed IPs: {allowed_ips}")
 
+
     if args.cpu:
         koboldai_vars.use_colab_tpu = False
 
@@ -1594,6 +1602,7 @@ def get_model_info(model, directory=""):
                         'break_values': break_values, 'gpu_count': gpu_count,
                         'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select,
                         'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False,
+                        'bit_4_available': koboldai_vars.bit_4_available if koboldai_vars.experimental_features else False,
                         'show_custom_model_box': show_custom_model_box})
     if send_horde_models:
         get_cluster_models({'key': key_value, 'url': default_url})
@@ -1767,9 +1776,59 @@
     #Reload our badwords
     koboldai_vars.badwordsids = koboldai_settings.badwordsids_default
+
+
+def prepare_4bit_load(modelpath):
+    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
+    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
+    result = False
+    groupsize = -1
+    for p in paths_4bit:
+        p = os.path.join(modelpath, p)
+        val = [v for v in glob.glob(p) if "4bit-old" not in v]
+        if val:
+            result = val[0]
+            fname = Path(result).parts[-1]
+            g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
+            if g:
+                groupsize = int(g[0])
+            break
+
+    global monkey_patched_4bit
+
+    # Monkey-patch in old-format pt-file support
+    if not result:
+        print("4-bit file not found, falling back to old format.")
+        for p in paths_4bit_old:
+            p = os.path.join(modelpath, p)
+            if os.path.isfile(p):
+                result = p
+                break
+
+        if not result:
+            print("4-bit old-format file not found, loading failed.")
+            raise RuntimeError(f"4-bit load failed. PT-File not found.")
+
+        import llama, opt, gptneox, gptj, old_quant
+        llama.make_quant = old_quant.old_make_quant
+        opt.make_quant = old_quant.old_make_quant
+        gptneox.make_quant = old_quant.old_make_quant
+        gptj.make_quant = old_quant.old_make_quant
+        monkey_patched_4bit = True
+    elif monkey_patched_4bit:
+        # Undo monkey patch
+        print("Undoing 4-bit old format monkey patch")
+        import llama, opt, gptneox, gptj, quant
+        llama.make_quant = quant.make_quant
+        opt.make_quant = quant.make_quant
+        gptneox.make_quant = quant.make_quant
+        gptj.make_quant = quant.make_quant
+        monkey_patched_4bit = False
+
+    return result, groupsize
 
-def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False):
+def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False):
     global model
     global tokenizer
     global model_config
@@ -1807,7 +1866,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             disk_layers = args.breakmodel_disklayers
         if breakmodel_args_default_to_cpu and disk_layers is None:
             disk_layers = args.breakmodel_disklayers = 0
-    
+
     unload_model()
 
     if online_model == "":
@@ -6472,7 +6531,7 @@ def UI_2_load_model(data):
     koboldai_vars.model = data['model']
     koboldai_vars.custmodpth = data['path']
     print("loading Model")
-    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'])
+    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'], use_4_bit=data['use_4_bit'])
 
 #==================================================================#
 # Event triggered when load story is clicked
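The groupsize handling in `prepare_4bit_load` above is driven entirely by the weight filename. The snippet below reproduces that regex in isolation (same pattern as in the diff) to show how `4bit-128g.safetensors` yields a groupsize of 128 while plain `4bit.pt` and old-format files fall back to -1; the `groupsize_from_filename` wrapper itself is only for illustration.

```python
# Minimal reproduction of the filename-based groupsize detection used in
# prepare_4bit_load above: "4bit-128g.safetensors" -> 128, plain "4bit.pt" -> -1.
import re

def groupsize_from_filename(fname):
    g = re.findall(r"^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
    return int(g[0]) if g else -1

assert groupsize_from_filename("4bit-128g.safetensors") == 128
assert groupsize_from_filename("4bit.safetensors") == -1
assert groupsize_from_filename("4bit-old.pt") == -1  # old-format files fall through to the monkey patch
```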
diff --git a/environments/huggingface.yml b/environments/huggingface.yml
index 138993dd..daa25e1f 100644
--- a/environments/huggingface.yml
+++ b/environments/huggingface.yml
@@ -31,6 +31,7 @@ dependencies:
     - flask-cors
     - lupa==1.10
     - transformers==4.28.0
+    - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc
     - huggingface_hub==0.12.1
     - safetensors
     - accelerate==0.18.0
diff --git a/environments/rocm.yml b/environments/rocm.yml
index e28c86cb..35fc098d 100644
--- a/environments/rocm.yml
+++ b/environments/rocm.yml
@@ -30,9 +30,10 @@ dependencies:
     - flask-cors
     - lupa==1.10
     - transformers==4.28.0
+    - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc
     - huggingface_hub==0.12.1
     - safetensors
-    - accelerate
+    - accelerate==0.18.0
     - git+https://github.com/VE-FORBRYDERNE/mkultra
     - ansi2html
     - flask_compress
diff --git a/install_requirements.bat b/install_requirements.bat
index 2a4534c1..3b735ddf 100644
--- a/install_requirements.bat
+++ b/install_requirements.bat
@@ -48,6 +48,8 @@ umamba.exe create -r B:\python\ -n base
 umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy
 umamba.exe -r B:\ clean -a -y
 rd B:\Python\pkgs /S /Q
+call B:\python\condabin\activate
+pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl"
 subst B: /d
 pause
 exit
@@ -60,5 +62,7 @@ umamba.exe create -r miniconda3\ -n base
 umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingface.yml -y --always-copy
 umamba.exe clean -a -y
 rd miniconda3\Python\pkgs /S /Q
+call miniconda3\condabin\activate
+pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl"
 pause
 exit
diff --git a/install_requirements.sh b/install_requirements.sh
index 6f0e0dfd..7b5a8d5b 100755
--- a/install_requirements.sh
+++ b/install_requirements.sh
@@ -5,6 +5,9 @@ wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -
 bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y
 # Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster
 bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y
+
+# Install quant_cuda module for 4-bit
+bin/micromamba run -r runtime -n koboldai pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl
 exit
 fi
 if [[ $1 = "rocm" ]]; then
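Both installers above pull in a prebuilt wheel that provides the compiled GPTQ CUDA kernels. A quick post-install check is to import that extension from inside the KoboldAI runtime environment; note the module name `quant_cuda` is assumed from the wheel filename, so this is a sketch rather than an official verification step.

```python
# Quick post-install check (run inside the KoboldAI runtime env): the GPTQ wheel
# installed above is assumed to provide a compiled extension named "quant_cuda".
try:
    import quant_cuda  # noqa: F401
    print("quant_cuda extension found; 4-bit CUDA kernels are available.")
except ImportError as err:
    print(f"quant_cuda not importable ({err}); re-run the pip install step above.")
```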
diff --git a/koboldai_settings.py b/koboldai_settings.py
index 5088f8b1..86dea169 100644
--- a/koboldai_settings.py
+++ b/koboldai_settings.py
@@ -1211,7 +1211,7 @@
                     'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy',
                     'serverstarted', 'inference_config', 'image_pipeline', 'summarizer',
                     'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model',
-                    'generating_image', 'bit_8_available', 'host', 'hascuda', 'usegpu', 'rng_states']
+                    'generating_image', 'bit_8_available', 'bit_4_available', 'host', 'hascuda', 'usegpu', 'rng_states']
     settings_name = "system"
     def __init__(self, socketio, koboldai_var):
         self._socketio = socketio
@@ -1305,6 +1305,8 @@
                 elif torch.cuda.get_device_properties(device).major == 7 and torch.cuda.get_device_properties(device).minor >= 2:
                     self.bit_8_available = True
                     break
+        # Check if repos/gptq exists for 4-bit mode
+        self.bit_4_available = os.path.isdir("repos/gptq")
 
         self.seen_messages = []
diff --git a/repos/gptq b/repos/gptq
new file mode 160000
index 00000000..50b22e2b
--- /dev/null
+++ b/repos/gptq
@@ -0,0 +1 @@
+Subproject commit 50b22e2ba8ec0f5cf0dca719392a2ec5254e7228
diff --git a/requirements.txt b/requirements.txt
index 13774e1f..6407303c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ markdown
 bleach==4.1.0
 sentencepiece
 protobuf
-accelerate
+accelerate==0.18.0
 flask-session==0.4.0
 marshmallow>=3.13
 apispec-webframeworks
diff --git a/static/koboldai.js b/static/koboldai.js
index cce66f80..89ee2ea1 100644
--- a/static/koboldai.js
+++ b/static/koboldai.js
@@ -1472,6 +1472,7 @@ function show_model_menu(data) {
 	document.getElementById("modelurl").classList.add("hidden");
 	document.getElementById("use_gpu_div").classList.add("hidden");
 	document.getElementById("use_8_bit_div").classList.add("hidden");
+	document.getElementById("use_4_bit_div").classList.add("hidden");
 	document.getElementById("modellayers").classList.add("hidden");
 	document.getElementById("oaimodel").classList.add("hidden");
 	var model_layer_bars = document.getElementById('model_layer_bars');
@@ -1646,6 +1647,14 @@
 		document.getElementById("use_8_bit").checked = false;
 	}
 	
+	//hide or unhide 4 bit mode
+	if (data.bit_4_available) {
+		document.getElementById("use_4_bit_div").classList.remove("hidden");
+	} else {
+		document.getElementById("use_4_bit_div").classList.add("hidden");
+		document.getElementById("use_4_bit").checked = false;
+	}
+	
 	//default URL loading
 	if (data.default_url != null) {
 		document.getElementById("modelurl").value = data.default_url;
@@ -1815,7 +1824,7 @@
 	}
 	
 	accept.disabled = false;
-	
+	set_4_bit_mode(invert=false);
 }
 
 function update_gpu_layers() {
@@ -1876,7 +1885,8 @@ function load_model() {
 					'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(),
 					'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value, 'online_model': selected_models,
-					'use_8_bit': document.getElementById('use_8_bit').checked};
+					'use_8_bit': document.getElementById('use_8_bit').checked,
+					'use_4_bit': document.getElementById('use_4_bit').checked};
 	socket.emit("load_model", message);
 	closePopups();
 }
@@ -3160,6 +3170,15 @@ function save_preset() {
 	closePopups();
 }
 
+function set_4_bit_mode(invert=true) {
+	bit_4_status = document.getElementById("use_4_bit").checked;
+	if (invert) {
+		bit_4_status = !bit_4_status;
+	}
+}
+
+
+
 //--------------------------------------------General UI Functions------------------------------------
 function set_ui_level(level) {
 	for (classname of ['setting_container', 'setting_container_single', 'setting_container_single_wide', 'biasing', 'palette_area', 'advanced_theme']) {
@@ -7301,4 +7320,4 @@ $el("#gamescreen").addEventListener("paste", function(event) {
 		false,
 		event.clipboardData.getData("text/plain")
 	);
-});
\ No newline at end of file
+});
diff --git a/templates/popups.html b/templates/popups.html
index 44cf7cb6..804b1b9f 100644
--- a/templates/popups.html
+++ b/templates/popups.html
@@ -75,6 +75,10 @@
 				Use 8 bit mode <!-- surrounding markup lost in extraction -->
+				Use 4 bit mode <!-- new checkbox block (container id "use_4_bit_div", input id "use_4_bit"); original HTML markup lost in extraction -->
@@ -402,4 +406,4 @@
-<!-- final line of the file; markup lost in extraction, change only adds a trailing newline -->
\ No newline at end of file
+<!-- final line of the file; markup lost in extraction, change only adds a trailing newline -->
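Tying the pieces above together: `koboldai_settings.py` sets `bit_4_available` from the presence of `repos/gptq`, `get_model_info` in `aiserver.py` forwards it to the UI only when experimental features are enabled, and `selected_model_info` in `koboldai.js` hides the checkbox otherwise. That is why the README asks you to turn on the Experimental UI before the 4-bit toggle appears. A standalone illustration of that gating logic, with a hypothetical helper name:

```python
# Standalone illustration of the 4-bit toggle gating used in this diff:
# koboldai_settings.py computes bit_4_available, get_model_info() forwards it
# only when experimental features are on, and the UI hides the checkbox otherwise.
import os

def bit_4_toggle_visible(experimental_features, repo_dir="repos/gptq"):
    bit_4_available = os.path.isdir(repo_dir)                    # koboldai_settings.py check
    return bit_4_available if experimental_features else False   # get_model_info() gate

print(bit_4_toggle_visible(experimental_features=True))   # True only if repos/gptq was cloned
print(bit_4_toggle_visible(experimental_features=False))  # always False -> use_4_bit_div stays hidden
```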