Merge latestgptq branch onto one-some's model-structure-and-maybe-rwkv branch

This commit is contained in:
0cc4m
2023-04-16 09:06:12 +02:00
12 changed files with 150 additions and 19 deletions

.gitmodules

@@ -4,3 +4,7 @@
[submodule "KoboldAI-Horde-Bridge"]
path = KoboldAI-Horde-Bridge
url = https://github.com/db0/KoboldAI-Horde-Bridge
[submodule "repos/gptq"]
path = repos/gptq
url = https://github.com/0cc4m/GPTQ-for-LLaMa
branch = a8303654c200c25577130466e5f9bc1e70fc8a50


@@ -1,3 +1,36 @@
## This is a fork of KoboldAI that adds 4-bit GPTQ quantized model support, including Llama.
### Install/Use Guide
(This guide covers both Linux and Windows and assumes the user has git installed and a basic grasp of command-line use.)
In the command prompt/command line, navigate to where you want the KoboldAI subfolder to be created.
`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules`
`cd KoboldAI`
Next, run the installer. (On Windows you will be asked to choose between subfolder mode and the B: drive option; either choice works.)
[Windows] Run `install_requirements.bat`. If it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory.
[Linux] Run `install_requirements.sh`.
If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder)
Put your 4-bit quantized .pt or .safetensors file in that folder along with all associated .json files and tokenizer.model (the .json files and tokenizer.model should come from the Huggingface model folder of the same model type).
Then move your model folder to KoboldAI/models and rename the .pt or .safetensors file to `4bit.pt` or `4bit.safetensors` for non-groupsize models, or `4bit-<groupsize>g.pt` or `4bit-<groupsize>g.safetensors` for groupsize models (example: `4bit-128g.safetensors`).
Your model folder should then look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json". Note: the 4bit.pt file can sit in the same folder as the regular HF .bin files it was quantized from; as long as the 4-bit toggle is on, the quantized model will be loaded (the toggle is explained below).
If you haven't done so already, exit the command prompt/leave KAI's conda env (close the command-line window on Windows, run `exit` on Linux).
Run `play.bat` [Windows] or `play.sh` [Linux].
Switch to UI2, enable Experimental UI under the Interface tab, then load your model and make sure the 4-bit toggle is on.
The 4-bit toggle appears once a model has been selected for loading.
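Before loading, it can help to sanity-check the model folder. The snippet below is a minimal sketch, not part of KoboldAI; the folder path and required-file list are assumptions based on the layout described above:

```python
# Hypothetical pre-flight check, not part of KoboldAI: verifies that a model
# folder roughly matches the layout described above before you try to load it.
import glob
import os

def check_4bit_model_folder(modelpath):
    required = ["config.json", "tokenizer.model", "tokenizer_config.json"]
    missing = [f for f in required if not os.path.isfile(os.path.join(modelpath, f))]
    if missing:
        print(f"Warning: missing {missing}")
    # The quantized weights must be named 4bit.* or 4bit-<groupsize>g.*
    candidates = (glob.glob(os.path.join(modelpath, "4bit*.safetensors"))
                  + glob.glob(os.path.join(modelpath, "4bit*.pt")))
    if candidates:
        print("Found quantized file:", os.path.basename(candidates[0]))
    else:
        print("No 4bit.pt / 4bit.safetensors found; nothing for the 4-bit toggle to load.")

# Example (hypothetical path): check_4bit_model_folder("models/my-llama-13b-4bit")
```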
## KoboldAI - Your gateway to GPT writing
This is a browser-based front-end for AI-assisted writing with multiple local & remote AI models. It offers the standard array of tools, including Memory, Author's Note, World Info, Save & Load, adjustable AI settings, formatting options, and the ability to import existing AI Dungeon adventures. You can also turn on Adventure mode and play the game like AI Dungeon Unleashed.


@@ -87,6 +87,18 @@ allowed_ips = set() # empty set
enable_whitelist = False
# 4-bit dependencies
from pathlib import Path
import glob
sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
from gptj import load_quant as gptj_load_quant
from gptneox import load_quant as gptneox_load_quant
from llama import load_quant as llama_load_quant
from opt import load_quant as opt_load_quant
from offload import load_quant_offload
monkey_patched_4bit = False
if lupa.LUA_VERSION[:2] != (5, 4):
logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.")
@@ -1002,8 +1014,6 @@ def getmodelname():
def get_hidden_size_from_model(model):
    return model.get_input_embeddings().embedding_dim
#==================================================================#
# Allow the models to override some settings
#==================================================================#
@@ -1305,7 +1315,7 @@ def general_startup(override_args=None):
parser.add_argument("--noaimenu", action='store_true', help="Disables the ability to select the AI")
parser.add_argument("--ngrok", action='store_true', help="Optimizes KoboldAI for Remote Play using Ngrok")
parser.add_argument("--localtunnel", action='store_true', help="Optimizes KoboldAI for Remote Play using Localtunnel")
parser.add_argument("--host", type=str, default="", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc")
parser.add_argument("--host", type=str, default="Disabled", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc")
parser.add_argument("--port", type=int, help="Specify the port on which the application will be joinable")
parser.add_argument("--aria2_port", type=int, help="Specify the port on which aria2's RPC interface will be open if aria2 is installed (defaults to 6799)")
parser.add_argument("--model", help="Specify the Model Type to skip the Menu")
@@ -1436,17 +1446,14 @@ def general_startup(override_args=None):
if args.localtunnel:
koboldai_vars.host = True;
if args.host == "":
koboldai_vars.host = True
args.unblock = True
if args.host:
if args.host != "Disabled":
# This means --host option was submitted without an argument
# Enable all LAN IPs (0.0.0.0/0)
koboldai_vars.host = True
args.unblock = True
if args.host != "":
# Check if --host option was submitted with an argument
# Parse the supplied IP(s) and add them to the allowed IPs list
koboldai_vars.host = True
args.unblock = True
enable_whitelist = True
for ip_str in args.host.split(","):
if "/" in ip_str:
@@ -1463,6 +1470,7 @@ def general_startup(override_args=None):
print(f"Allowed IPs: {allowed_ips}")
if args.cpu:
    koboldai_vars.use_colab_tpu = False
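For reference, the --host whitelist accepts single IPs, dash-separated ranges, and CIDR subnets. A minimal sketch of how such a list could be expanded with Python's standard ipaddress module follows; the helper name and return shape are illustrative, not KoboldAI's actual implementation:

```python
# Illustrative sketch only, not KoboldAI's code: expand a --host style
# whitelist string into a set of networks for later membership checks.
import ipaddress

def parse_host_whitelist(spec):
    allowed = set()
    for ip_str in spec.split(","):
        ip_str = ip_str.strip()
        if "/" in ip_str:
            # CIDR subnet, e.g. 10.0.0.0/24
            allowed.add(ipaddress.ip_network(ip_str, strict=False))
        elif "-" in ip_str:
            # Dash-separated range, e.g. 192.168.1.0-192.168.1.255
            start, end = (ipaddress.ip_address(p) for p in ip_str.split("-"))
            allowed.update(ipaddress.summarize_address_range(start, end))
        else:
            # Single IP becomes a /32 network
            allowed.add(ipaddress.ip_network(ip_str))
    return allowed

# Membership test: any(ipaddress.ip_address(client_ip) in net for net in allowed)
```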
@@ -1594,6 +1602,7 @@ def get_model_info(model, directory=""):
'break_values': break_values, 'gpu_count': gpu_count,
'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select,
'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False,
'bit_4_available': koboldai_vars.bit_4_available if koboldai_vars.experimental_features else False,
'show_custom_model_box': show_custom_model_box})
if send_horde_models:
get_cluster_models({'key': key_value, 'url': default_url})
@@ -1769,7 +1778,57 @@ def unload_model():
koboldai_vars.badwordsids = koboldai_settings.badwordsids_default
def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False):
def prepare_4bit_load(modelpath):
    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
    result = False
    groupsize = -1
    for p in paths_4bit:
        p = os.path.join(modelpath, p)
        val = [v for v in glob.glob(p) if "4bit-old" not in v]
        if val:
            result = val[0]
            fname = Path(result).parts[-1]
            g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
            if g:
                groupsize = int(g[0])
            break

    global monkey_patched_4bit

    # Monkey-patch in old-format pt-file support
    if not result:
        print("4-bit file not found, falling back to old format.")
        for p in paths_4bit_old:
            p = os.path.join(modelpath, p)
            if os.path.isfile(p):
                result = p
                break

        if not result:
            print("4-bit old-format file not found, loading failed.")
            raise RuntimeError(f"4-bit load failed. PT-File not found.")

        import llama, opt, gptneox, gptj, old_quant
        llama.make_quant = old_quant.old_make_quant
        opt.make_quant = old_quant.old_make_quant
        gptneox.make_quant = old_quant.old_make_quant
        gptj.make_quant = old_quant.old_make_quant
        monkey_patched_4bit = True
    elif monkey_patched_4bit:
        # Undo monkey patch
        print("Undoing 4-bit old format monkey patch")
        import llama, opt, gptneox, gptj, quant
        llama.make_quant = quant.make_quant
        opt.make_quant = quant.make_quant
        gptneox.make_quant = quant.make_quant
        gptj.make_quant = quant.make_quant
        monkey_patched_4bit = False

    return result, groupsize
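For reference, the groupsize filename pattern used in prepare_4bit_load maps names to groupsizes as follows; the example filenames are illustrative, not taken from the repository:

```python
# Quick demonstration of the groupsize pattern used in prepare_4bit_load above.
import re

pattern = r"^(?:4bit)(?:-)(\d+)(?:g-?)"
for fname in ("4bit.safetensors", "4bit-128g.safetensors", "4bit-32g.pt"):
    g = re.findall(pattern, fname)
    print(fname, "->", int(g[0]) if g else -1)
# 4bit.safetensors -> -1 (no groupsize)
# 4bit-128g.safetensors -> 128
# 4bit-32g.pt -> 32
```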
def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False):
    global model
    global tokenizer
    global model_config
@@ -6472,7 +6531,7 @@ def UI_2_load_model(data):
    koboldai_vars.model = data['model']
    koboldai_vars.custmodpth = data['path']
    print("loading Model")
    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'])
    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'], use_4_bit=data['use_4_bit'])
#==================================================================#
# Event triggered when load story is clicked


@@ -31,6 +31,7 @@ dependencies:
- flask-cors
- lupa==1.10
- transformers==4.28.0
- git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc
- huggingface_hub==0.12.1
- safetensors
- accelerate==0.18.0


@@ -30,9 +30,10 @@ dependencies:
- flask-cors
- lupa==1.10
- transformers==4.28.0
- git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc
- huggingface_hub==0.12.1
- safetensors
- accelerate
- accelerate==0.18.0
- git+https://github.com/VE-FORBRYDERNE/mkultra
- ansi2html
- flask_compress


@@ -48,6 +48,8 @@ umamba.exe create -r B:\python\ -n base
umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy
umamba.exe -r B:\ clean -a -y
rd B:\Python\pkgs /S /Q
call B:\python\condabin\activate
pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl"
subst B: /d
pause
exit
@@ -60,5 +62,7 @@ umamba.exe create -r miniconda3\ -n base
umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingface.yml -y --always-copy
umamba.exe clean -a -y
rd miniconda3\Python\pkgs /S /Q
call miniconda3\condabin\activate
pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl"
pause
exit
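After the quant_cuda wheel installs above, the CUDA kernels should be importable from the KoboldAI runtime environment. A quick post-install check, assuming the extension module is named quant_cuda as the wheel filename suggests:

```python
# Quick post-install check, run inside the KoboldAI environment; it assumes
# the wheel exposes a module named "quant_cuda" (as its filename suggests).
import importlib.util

if importlib.util.find_spec("quant_cuda") is None:
    print("quant_cuda not found: the 4-bit CUDA kernels did not install.")
else:
    print("quant_cuda is available.")
```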


@@ -5,6 +5,9 @@ wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -
bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y
# Weird micromamba bug causes it to fail the first time, running it twice just to be safe, the second time is much faster
bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y
# Install quant_cuda module for 4-bit
bin/micromamba run -r runtime -n koboldai pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl
exit
fi
if [[ $1 = "rocm" ]]; then


@@ -1211,7 +1211,7 @@ class system_settings(settings):
'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy',
'serverstarted', 'inference_config', 'image_pipeline', 'summarizer',
'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model',
'generating_image', 'bit_8_available', 'host', 'hascuda', 'usegpu', 'rng_states']
'generating_image', 'bit_8_available', 'bit_4_available', 'host', 'hascuda', 'usegpu', 'rng_states']
settings_name = "system"
def __init__(self, socketio, koboldai_var):
self._socketio = socketio
@@ -1305,6 +1305,8 @@ class system_settings(settings):
elif torch.cuda.get_device_properties(device).major == 7 and torch.cuda.get_device_properties(device).minor >= 2:
    self.bit_8_available = True
    break
# Check if repos/gptq exists for 4-bit mode
self.bit_4_available = os.path.isdir("repos/gptq")
self.seen_messages = []


Submodule repos/gptq added at 50b22e2ba8


@@ -15,7 +15,7 @@ markdown
bleach==4.1.0
sentencepiece
protobuf
accelerate
accelerate==0.18.0
flask-session==0.4.0
marshmallow>=3.13
apispec-webframeworks


@@ -1472,6 +1472,7 @@ function show_model_menu(data) {
document.getElementById("modelurl").classList.add("hidden");
document.getElementById("use_gpu_div").classList.add("hidden");
document.getElementById("use_8_bit_div").classList.add("hidden");
document.getElementById("use_4_bit_div").classList.add("hidden");
document.getElementById("modellayers").classList.add("hidden");
document.getElementById("oaimodel").classList.add("hidden");
var model_layer_bars = document.getElementById('model_layer_bars');
@@ -1646,6 +1647,14 @@ function selected_model_info(data) {
document.getElementById("use_8_bit").checked = false;
}
//hide or unhide 4 bit mode
if (data.bit_4_available) {
document.getElementById("use_4_bit_div").classList.remove("hidden");
} else {
document.getElementById("use_4_bit_div").classList.add("hidden");
document.getElementById("use_4_bit").checked = false;
}
//default URL loading
if (data.default_url != null) {
document.getElementById("modelurl").value = data.default_url;
@@ -1815,7 +1824,7 @@ function selected_model_info(data) {
}
accept.disabled = false;
set_4_bit_mode(invert=false);
}
function update_gpu_layers() {
@@ -1876,7 +1885,8 @@ function load_model() {
'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(),
'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value,
'online_model': selected_models,
'use_8_bit': document.getElementById('use_8_bit').checked};
'use_8_bit': document.getElementById('use_8_bit').checked,
'use_4_bit': document.getElementById('use_4_bit').checked};
socket.emit("load_model", message);
closePopups();
}
@@ -3160,6 +3170,15 @@ function save_preset() {
closePopups();
}
function set_4_bit_mode(invert=true) {
    bit_4_status = document.getElementById("use_4_bit").checked;
    if (invert) {
        bit_4_status = !bit_4_status;
    }
}
//--------------------------------------------General UI Functions------------------------------------
function set_ui_level(level) {
for (classname of ['setting_container', 'setting_container_single', 'setting_container_single_wide', 'biasing', 'palette_area', 'advanced_theme']) {


@@ -75,6 +75,10 @@
<input type="checkbox" data-toggle="toggle" data-onstyle="success" id="use_8_bit" checked>
<div class="box-label">Use 8 bit mode</div>
</div>
<div class="box flex-push-right hidden" id=use_4_bit_div>
<input type="checkbox" data-toggle="toggle" data-onstyle="success" id="use_4_bit" checked>
<div class="box-label">Use 4 bit mode</div>
</div>
<button type="button" class="btn popup_load_cancel_button action_button disabled" onclick="load_model()" id="btn_loadmodelaccept" disabled>Load</button>
<button type="button" class="btn popup_load_cancel_button" onclick='closePopups();' id="btn_loadmodelclose">Cancel</button>
</div>