Merge latestgptq branch onto one-some's model-structure-and-maybe-rwkv branch

This commit is contained in:
0cc4m
2023-04-16 09:06:12 +02:00
12 changed files with 150 additions and 19 deletions

.gitmodules

@@ -4,3 +4,7 @@
[submodule "KoboldAI-Horde-Bridge"]
path = KoboldAI-Horde-Bridge
url = https://github.com/db0/KoboldAI-Horde-Bridge
[submodule "repos/gptq"]
path = repos/gptq
url = https://github.com/0cc4m/GPTQ-for-LLaMa
branch = a8303654c200c25577130466e5f9bc1e70fc8a50


@@ -1,3 +1,36 @@
## This is a fork of KoboldAI that adds 4-bit GPTQ quantized model support, including Llama.
### Install/Use Guide
(This guide covers both Linux and Windows and assumes the user has git installed and a basic grasp of command-line use.)
In the command prompt/command line, navigate to where you want the KoboldAI subfolder to be created.
`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules`
`cd KoboldAI`
Next, run the installer. (On Windows you will be asked to choose between subfolder mode and the B: drive option; either choice works.)
[Windows] Run `install_requirements.bat`. If it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory.
[Linux] Run `install_requirements.sh`.
If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder)
Put your 4-bit quantized .pt or .safetensors file in that folder along with all associated .json files and tokenizer.model (the .json files and tokenizer.model should come from the Huggingface model folder of the same model type).
Then move your model folder to KoboldAI/models and rename the .pt or .safetensors file to `4bit.pt` or `4bit.safetensors` for non-groupsize models, or `4bit-<groupsize>g.pt` or `4bit-<groupsize>g.safetensors` for groupsize models (example: `4bit-128g.safetensors`).
Your model folder should then look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json". Note: the 4bit.pt file can sit in the same folder as the regular HF .bin files it was quantized from; as long as the 4-bit toggle is on, the quantized model will be loaded (the toggle is explained below).
If you haven't done so already, exit the command prompt/leave KAI's conda env (close the command-line window on Windows, run `exit` on Linux).
Run `play.bat` [Windows] or `play.sh` [Linux].
Switch to UI2, enable Experimental UI under the Interface tab, then load your model and make sure the 4-bit toggle is on.
The 4-bit toggle appears once a model has been selected for loading.
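Before loading, it can help to sanity-check the model folder. The snippet below is a minimal sketch, not part of KoboldAI; the folder path and required-file list are assumptions based on the layout described above:

```python
# Hypothetical pre-flight check, not part of KoboldAI: verifies that a model
# folder roughly matches the layout described above before you try to load it.
import glob
import os

def check_4bit_model_folder(modelpath):
    required = ["config.json", "tokenizer.model", "tokenizer_config.json"]
    missing = [f for f in required if not os.path.isfile(os.path.join(modelpath, f))]
    if missing:
        print(f"Warning: missing {missing}")
    # The quantized weights must be named 4bit.* or 4bit-<groupsize>g.*
    candidates = (glob.glob(os.path.join(modelpath, "4bit*.safetensors"))
                  + glob.glob(os.path.join(modelpath, "4bit*.pt")))
    if candidates:
        print("Found quantized file:", os.path.basename(candidates[0]))
    else:
        print("No 4bit.pt / 4bit.safetensors found; nothing for the 4-bit toggle to load.")

# Example (hypothetical path): check_4bit_model_folder("models/my-llama-13b-4bit")
```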
## KoboldAI - Your gateway to GPT writing
This is a browser-based front-end for AI-assisted writing with multiple local & remote AI models. It offers the standard array of tools, including Memory, Author's Note, World Info, Save & Load, adjustable AI settings, formatting options, and the ability to import existing AI Dungeon adventures. You can also turn on Adventure mode and play the game like AI Dungeon Unleashed.


@@ -87,6 +87,18 @@ allowed_ips = set() # empty set
enable_whitelist = False
# 4-bit dependencies
from pathlib import Path
import glob
sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
from gptj import load_quant as gptj_load_quant
from gptneox import load_quant as gptneox_load_quant
from llama import load_quant as llama_load_quant
from opt import load_quant as opt_load_quant
from offload import load_quant_offload
monkey_patched_4bit = False
if lupa.LUA_VERSION[:2] != (5, 4):
logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.")
@@ -1002,8 +1014,6 @@ def getmodelname():
def get_hidden_size_from_model(model):
    return model.get_input_embeddings().embedding_dim
#==================================================================#
# Allow the models to override some settings
#==================================================================#
@@ -1305,7 +1315,7 @@ def general_startup(override_args=None):
parser.add_argument("--noaimenu", action='store_true', help="Disables the ability to select the AI")
parser.add_argument("--ngrok", action='store_true', help="Optimizes KoboldAI for Remote Play using Ngrok")
parser.add_argument("--localtunnel", action='store_true', help="Optimizes KoboldAI for Remote Play using Localtunnel")
parser.add_argument("--host", type=str, default="", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc")
parser.add_argument("--host", type=str, default="Disabled", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc")
parser.add_argument("--port", type=int, help="Specify the port on which the application will be joinable")
parser.add_argument("--aria2_port", type=int, help="Specify the port on which aria2's RPC interface will be open if aria2 is installed (defaults to 6799)")
parser.add_argument("--model", help="Specify the Model Type to skip the Menu")
@@ -1436,17 +1446,14 @@ def general_startup(override_args=None):
if args.localtunnel:
koboldai_vars.host = True;
if args.host == "":
koboldai_vars.host = True
args.unblock = True
if args.host:
if args.host != "Disabled":
# This means --host option was submitted without an argument
# Enable all LAN IPs (0.0.0.0/0)
koboldai_vars.host = True
args.unblock = True
if args.host != "":
# Check if --host option was submitted with an argument
# Parse the supplied IP(s) and add them to the allowed IPs list
koboldai_vars.host = True
args.unblock = True
enable_whitelist = True
for ip_str in args.host.split(","):
if "/" in ip_str:
@@ -1463,6 +1470,7 @@ def general_startup(override_args=None):
print(f"Allowed IPs: {allowed_ips}")
if args.cpu:
    koboldai_vars.use_colab_tpu = False
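For reference, the --host whitelist accepts single IPs, dash-separated ranges, and CIDR subnets. A minimal sketch of how such a list could be expanded with Python's standard ipaddress module follows; the helper name and return shape are illustrative, not KoboldAI's actual implementation:

```python
# Illustrative sketch only, not KoboldAI's code: expand a --host style
# whitelist string into a set of networks for later membership checks.
import ipaddress

def parse_host_whitelist(spec):
    allowed = set()
    for ip_str in spec.split(","):
        ip_str = ip_str.strip()
        if "/" in ip_str:
            # CIDR subnet, e.g. 10.0.0.0/24
            allowed.add(ipaddress.ip_network(ip_str, strict=False))
        elif "-" in ip_str:
            # Dash-separated range, e.g. 192.168.1.0-192.168.1.255
            start, end = (ipaddress.ip_address(p) for p in ip_str.split("-"))
            allowed.update(ipaddress.summarize_address_range(start, end))
        else:
            # Single IP becomes a /32 network
            allowed.add(ipaddress.ip_network(ip_str))
    return allowed

# Membership test: any(ipaddress.ip_address(client_ip) in net for net in allowed)
```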
@@ -1594,6 +1602,7 @@ def get_model_info(model, directory=""):
'break_values': break_values, 'gpu_count': gpu_count,
'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select,
'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False,
'bit_4_available': koboldai_vars.bit_4_available if koboldai_vars.experimental_features else False,
'show_custom_model_box': show_custom_model_box})
if send_horde_models:
get_cluster_models({'key': key_value, 'url': default_url})
@@ -1769,7 +1778,57 @@ def unload_model():
koboldai_vars.badwordsids = koboldai_settings.badwordsids_default
def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False):
def prepare_4bit_load(modelpath):
    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
    result = False
    groupsize = -1
    for p in paths_4bit:
        p = os.path.join(modelpath, p)
        val = [v for v in glob.glob(p) if "4bit-old" not in v]
        if val:
            result = val[0]
            fname = Path(result).parts[-1]
            g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
            if g:
                groupsize = int(g[0])
            break

    global monkey_patched_4bit

    # Monkey-patch in old-format pt-file support
    if not result:
        print("4-bit file not found, falling back to old format.")
        for p in paths_4bit_old:
            p = os.path.join(modelpath, p)
            if os.path.isfile(p):
                result = p
                break

        if not result:
            print("4-bit old-format file not found, loading failed.")
            raise RuntimeError(f"4-bit load failed. PT-File not found.")

        import llama, opt, gptneox, gptj, old_quant
        llama.make_quant = old_quant.old_make_quant
        opt.make_quant = old_quant.old_make_quant
        gptneox.make_quant = old_quant.old_make_quant
        gptj.make_quant = old_quant.old_make_quant
        monkey_patched_4bit = True
    elif monkey_patched_4bit:
        # Undo monkey patch
        print("Undoing 4-bit old format monkey patch")
        import llama, opt, gptneox, gptj, quant
        llama.make_quant = quant.make_quant
        opt.make_quant = quant.make_quant
        gptneox.make_quant = quant.make_quant
        gptj.make_quant = quant.make_quant
        monkey_patched_4bit = False

    return result, groupsize
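For reference, the groupsize filename pattern used in prepare_4bit_load maps names to groupsizes as follows; the example filenames are illustrative, not taken from the repository:

```python
# Quick demonstration of the groupsize pattern used in prepare_4bit_load above.
import re

pattern = r"^(?:4bit)(?:-)(\d+)(?:g-?)"
for fname in ("4bit.safetensors", "4bit-128g.safetensors", "4bit-32g.pt"):
    g = re.findall(pattern, fname)
    print(fname, "->", int(g[0]) if g else -1)
# 4bit.safetensors -> -1 (no groupsize)
# 4bit-128g.safetensors -> 128
# 4bit-32g.pt -> 32
```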
def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False):
    global model
    global tokenizer
    global model_config
@@ -6472,7 +6531,7 @@ def UI_2_load_model(data):
    koboldai_vars.model = data['model']
    koboldai_vars.custmodpth = data['path']
    print("loading Model")
    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'])
    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'], use_4_bit=data['use_4_bit'])
#==================================================================#
# Event triggered when load story is clicked


@@ -31,6 +31,7 @@ dependencies:
- flask-cors
- lupa==1.10
- transformers==4.28.0
- git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc
- huggingface_hub==0.12.1
- safetensors
- accelerate==0.18.0


@@ -30,9 +30,10 @@ dependencies:
- flask-cors
- lupa==1.10
- transformers==4.28.0
- git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc
- huggingface_hub==0.12.1
- safetensors
- accelerate
- accelerate==0.18.0
- git+https://github.com/VE-FORBRYDERNE/mkultra
- ansi2html
- flask_compress


@@ -48,6 +48,8 @@ umamba.exe create -r B:\python\ -n base
umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy
umamba.exe -r B:\ clean -a -y
rd B:\Python\pkgs /S /Q
call B:\python\condabin\activate
pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl"
subst B: /d
pause
exit
@@ -60,5 +62,7 @@ umamba.exe create -r miniconda3\ -n base
umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingface.yml -y --always-copy
umamba.exe clean -a -y
rd miniconda3\Python\pkgs /S /Q
call miniconda3\condabin\activate
pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl"
pause
exit
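After the quant_cuda wheel installs above, the CUDA kernels should be importable from the KoboldAI runtime environment. A quick post-install check, assuming the extension module is named quant_cuda as the wheel filename suggests:

```python
# Quick post-install check, run inside the KoboldAI environment; it assumes
# the wheel exposes a module named "quant_cuda" (as its filename suggests).
import importlib.util

if importlib.util.find_spec("quant_cuda") is None:
    print("quant_cuda not found: the 4-bit CUDA kernels did not install.")
else:
    print("quant_cuda is available.")
```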


@@ -5,6 +5,9 @@ wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -
bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y
# Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster
bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y
# Install quant_cuda module for 4-bit
bin/micromamba run -r runtime -n koboldai pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl
exit
fi
if [[ $1 = "rocm" ]]; then


@@ -1211,7 +1211,7 @@ class system_settings(settings):
'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy',
'serverstarted', 'inference_config', 'image_pipeline', 'summarizer',
'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model',
'generating_image', 'bit_8_available', 'host', 'hascuda', 'usegpu', 'rng_states']
'generating_image', 'bit_8_available', 'bit_4_available', 'host', 'hascuda', 'usegpu', 'rng_states']
settings_name = "system"
def __init__(self, socketio, koboldai_var):
self._socketio = socketio
@@ -1305,6 +1305,8 @@ class system_settings(settings):
elif torch.cuda.get_device_properties(device).major == 7 and torch.cuda.get_device_properties(device).minor >= 2:
    self.bit_8_available = True
    break
# Check if repos/gptq exists for 4-bit mode
self.bit_4_available = os.path.isdir("repos/gptq")
self.seen_messages = []


Submodule repos/gptq added at 50b22e2ba8


@@ -15,7 +15,7 @@ markdown
bleach==4.1.0
sentencepiece
protobuf
accelerate
accelerate==0.18.0
flask-session==0.4.0
marshmallow>=3.13
apispec-webframeworks


@@ -1472,6 +1472,7 @@ function show_model_menu(data) {
document.getElementById("modelurl").classList.add("hidden");
document.getElementById("use_gpu_div").classList.add("hidden");
document.getElementById("use_8_bit_div").classList.add("hidden");
document.getElementById("use_4_bit_div").classList.add("hidden");
document.getElementById("modellayers").classList.add("hidden");
document.getElementById("oaimodel").classList.add("hidden");
var model_layer_bars = document.getElementById('model_layer_bars');
@@ -1646,6 +1647,14 @@ function selected_model_info(data) {
document.getElementById("use_8_bit").checked = false;
}
//hide or unhide 4 bit mode
if (data.bit_4_available) {
document.getElementById("use_4_bit_div").classList.remove("hidden");
} else {
document.getElementById("use_4_bit_div").classList.add("hidden");
document.getElementById("use_4_bit").checked = false;
}
//default URL loading
if (data.default_url != null) {
document.getElementById("modelurl").value = data.default_url;
@@ -1815,7 +1824,7 @@ function selected_model_info(data) {
}
accept.disabled = false;
set_4_bit_mode(invert=false);
}
function update_gpu_layers() {
@@ -1876,7 +1885,8 @@ function load_model() {
'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(),
'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value,
'online_model': selected_models,
'use_8_bit': document.getElementById('use_8_bit').checked};
'use_8_bit': document.getElementById('use_8_bit').checked,
'use_4_bit': document.getElementById('use_4_bit').checked};
socket.emit("load_model", message);
closePopups();
}
@@ -3160,6 +3170,15 @@ function save_preset() {
closePopups();
}
function set_4_bit_mode(invert=true) {
    bit_4_status = document.getElementById("use_4_bit").checked;
    if (invert) {
        bit_4_status = !bit_4_status;
    }
}
//--------------------------------------------General UI Functions------------------------------------
function set_ui_level(level) {
for (classname of ['setting_container', 'setting_container_single', 'setting_container_single_wide', 'biasing', 'palette_area', 'advanced_theme']) {


@@ -75,6 +75,10 @@
<input type="checkbox" data-toggle="toggle" data-onstyle="success" id="use_8_bit" checked>
<div class="box-label">Use 8 bit mode</div>
</div>
<div class="box flex-push-right hidden" id=use_4_bit_div>
<input type="checkbox" data-toggle="toggle" data-onstyle="success" id="use_4_bit" checked>
<div class="box-label">Use 4 bit mode</div>
</div>
<button type="button" class="btn popup_load_cancel_button action_button disabled" onclick="load_model()" id="btn_loadmodelaccept" disabled>Load</button>
<button type="button" class="btn popup_load_cancel_button" onclick='closePopups();' id="btn_loadmodelclose">Cancel</button>
</div>