Merge latestgptq branch onto one-some's model-structure-and-maybe-rwkv branch

.gitmodules (vendored) | 4
@@ -4,3 +4,7 @@
[submodule "KoboldAI-Horde-Bridge"]
	path = KoboldAI-Horde-Bridge
	url = https://github.com/db0/KoboldAI-Horde-Bridge
[submodule "repos/gptq"]
	path = repos/gptq
	url = https://github.com/0cc4m/GPTQ-for-LLaMa
	branch = a8303654c200c25577130466e5f9bc1e70fc8a50

README.md | 33

@@ -1,3 +1,36 @@
## This is a fork of KoboldAI that adds support for 4-bit GPTQ quantized models, including Llama.

### Install/Use Guide
(This guide covers both Linux and Windows and assumes you have git installed and a basic grasp of command-line use.)

In the command prompt/command line, navigate to where you want the KoboldAI subfolder to be created.

`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules`

`cd KoboldAI`

Next, run the installer. (On Windows it will ask for subfolder mode or the B: drive option; it doesn't matter which, choose either.)

[if on Windows] `install_requirements.bat`. If it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory.

[if on Linux] `install_requirements.sh`

If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder).

Put your 4-bit quantized .pt or .safetensors file in that folder, along with all associated .json files and tokenizer.model (the .json files and tokenizer.model should come from the Huggingface model folder of the same model type).

Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models, or `4bit-<groupsize>g.pt` or `4bit-<groupsize>g.safetensors` for a groupsize model (example: `4bit-128g.safetensors`).

So your .pt model folder should look like this: `4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json`. Note: the 4bit.pt file can sit in the same folder as the regular HF .bin files it was quantized from; as long as the 4-bit toggle is on, the quantized model will be loaded (the 4-bit toggle is explained below).
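
For reference, here is a minimal, hypothetical Python sketch (not part of KoboldAI itself, and the folder name is just an example) that checks a model folder follows this layout and extracts the groupsize from the weight filename, mirroring the `4bit*` / `4bit-<groupsize>g` naming convention the loader expects:

```python
import glob
import os
import re

def check_4bit_model_folder(modelpath):
    """Hypothetical helper: sanity-check a KoboldAI 4-bit model folder."""
    # The quantized weights must be named 4bit(.pt|.safetensors) or 4bit-<groupsize>g(.pt|.safetensors)
    weights = [f for f in glob.glob(os.path.join(modelpath, "4bit*.safetensors"))
               + glob.glob(os.path.join(modelpath, "4bit*.pt"))
               if "4bit-old" not in f]
    if not weights:
        raise FileNotFoundError("No 4bit*.pt or 4bit*.safetensors file found")

    # Groupsize models encode the groupsize in the filename, e.g. 4bit-128g.safetensors
    match = re.search(r"^4bit-(\d+)g", os.path.basename(weights[0]))
    groupsize = int(match.group(1)) if match else -1

    # The config and tokenizer files must come from the matching Huggingface model folder
    for required in ("config.json", "tokenizer.model"):
        if not os.path.isfile(os.path.join(modelpath, required)):
            raise FileNotFoundError(f"Missing {required} in {modelpath}")

    return weights[0], groupsize

# Example: check_4bit_model_folder("models/llama-13b-4bit")
# -> ("models/llama-13b-4bit/4bit-128g.safetensors", 128)
```
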

If you haven't done so already, exit the command prompt/leave KAI's conda env (close the command-line window on Windows, or run `exit` on Linux).

Run `play.bat` [Windows] or `play.sh` [Linux].

Switch to UI2, enable Experimental UI under the Interface tab, then load your model and make sure the 4-bit toggle is on.

The 4-bit toggle appears once a model to load has been selected.

## KoboldAI - Your gateway to GPT writing

This is a browser-based front-end for AI-assisted writing with multiple local & remote AI models. It offers the standard array of tools, including Memory, Author's Note, World Info, Save & Load, adjustable AI settings, formatting options, and the ability to import existing AI Dungeon adventures. You can also turn on Adventure mode and play the game like AI Dungeon Unleashed.

aiserver.py | 83

@@ -87,6 +87,18 @@ allowed_ips = set() # empty set
enable_whitelist = False


# 4-bit dependencies
from pathlib import Path
import glob
sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
from gptj import load_quant as gptj_load_quant
from gptneox import load_quant as gptneox_load_quant
from llama import load_quant as llama_load_quant
from opt import load_quant as opt_load_quant
from offload import load_quant_offload
monkey_patched_4bit = False


if lupa.LUA_VERSION[:2] != (5, 4):
    logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.")
@@ -1002,8 +1014,6 @@ def getmodelname():
def get_hidden_size_from_model(model):
    return model.get_input_embeddings().embedding_dim

#==================================================================#
# Allow the models to override some settings
#==================================================================#
@@ -1305,7 +1315,7 @@ def general_startup(override_args=None):
    parser.add_argument("--noaimenu", action='store_true', help="Disables the ability to select the AI")
    parser.add_argument("--ngrok", action='store_true', help="Optimizes KoboldAI for Remote Play using Ngrok")
    parser.add_argument("--localtunnel", action='store_true', help="Optimizes KoboldAI for Remote Play using Localtunnel")
    parser.add_argument("--host", type=str, default="", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc")
    parser.add_argument("--host", type=str, default="Disabled", nargs="?", const="", help="Optimizes KoboldAI for LAN Remote Play without using a proxy service. --host opens to all LAN. Enable IP whitelisting by using a comma separated IP list. Supports individual IPs, ranges, and subnets --host 127.0.0.1,127.0.0.2,127.0.0.3,192.168.1.0-192.168.1.255,10.0.0.0/24,etc")
    parser.add_argument("--port", type=int, help="Specify the port on which the application will be joinable")
    parser.add_argument("--aria2_port", type=int, help="Specify the port on which aria2's RPC interface will be open if aria2 is installed (defaults to 6799)")
    parser.add_argument("--model", help="Specify the Model Type to skip the Menu")
@@ -1436,17 +1446,14 @@
    if args.localtunnel:
        koboldai_vars.host = True;

    if args.host == "":
        koboldai_vars.host = True
        args.unblock = True
    if args.host:
    if args.host != "Disabled":
        # This means --host option was submitted without an argument
        # Enable all LAN IPs (0.0.0.0/0)
        koboldai_vars.host = True
        args.unblock = True
        if args.host != "":
            # Check if --host option was submitted with an argument
            # Parse the supplied IP(s) and add them to the allowed IPs list
            koboldai_vars.host = True
            args.unblock = True
            enable_whitelist = True
            for ip_str in args.host.split(","):
                if "/" in ip_str:
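
The hunk above cuts off inside the whitelist-parsing loop. As an illustration of the address formats the `--host` help text describes (single IPs, dashed ranges, and CIDR subnets), here is a small self-contained Python sketch; it is not the KoboldAI implementation, just the same idea expressed with the standard `ipaddress` module:

```python
import ipaddress

def parse_host_whitelist(spec):
    """Sketch only: expand a --host whitelist string into a set of allowed IPs."""
    allowed = set()
    for ip_str in spec.split(","):
        ip_str = ip_str.strip()
        if "/" in ip_str:
            # CIDR subnet, e.g. 10.0.0.0/24: allow every host address in the network
            allowed.update(str(ip) for ip in ipaddress.ip_network(ip_str, strict=False).hosts())
        elif "-" in ip_str:
            # Dashed range, e.g. 192.168.1.0-192.168.1.255: allow every address in between, inclusive
            start, end = (ipaddress.ip_address(part) for part in ip_str.split("-"))
            allowed.update(str(ipaddress.ip_address(i)) for i in range(int(start), int(end) + 1))
        else:
            # Single IP
            allowed.add(str(ipaddress.ip_address(ip_str)))
    return allowed

# parse_host_whitelist("127.0.0.1,192.168.1.10-192.168.1.12,10.0.0.0/30")
# -> {'127.0.0.1', '192.168.1.10', '192.168.1.11', '192.168.1.12', '10.0.0.1', '10.0.0.2'}
```
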
@@ -1463,6 +1470,7 @@
            print(f"Allowed IPs: {allowed_ips}")


    if args.cpu:
        koboldai_vars.use_colab_tpu = False
@@ -1594,6 +1602,7 @@ def get_model_info(model, directory=""):
                        'break_values': break_values, 'gpu_count': gpu_count,
                        'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select,
                        'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False,
                        'bit_4_available': koboldai_vars.bit_4_available if koboldai_vars.experimental_features else False,
                        'show_custom_model_box': show_custom_model_box})
    if send_horde_models:
        get_cluster_models({'key': key_value, 'url': default_url})
@@ -1767,9 +1776,59 @@ def unload_model():
    #Reload our badwords
    koboldai_vars.badwordsids = koboldai_settings.badwordsids_default


def prepare_4bit_load(modelpath):
    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
    result = False
    groupsize = -1
    for p in paths_4bit:
        p = os.path.join(modelpath, p)
        val = [v for v in glob.glob(p) if "4bit-old" not in v]
        if val:
            result = val[0]
            fname = Path(result).parts[-1]
            g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
            if g:
                groupsize = int(g[0])
            break

    global monkey_patched_4bit

    # Monkey-patch in old-format pt-file support
    if not result:
        print("4-bit file not found, falling back to old format.")
        for p in paths_4bit_old:
            p = os.path.join(modelpath, p)
            if os.path.isfile(p):
                result = p
                break

        if not result:
            print("4-bit old-format file not found, loading failed.")
            raise RuntimeError(f"4-bit load failed. PT-File not found.")

        import llama, opt, gptneox, gptj, old_quant
        llama.make_quant = old_quant.old_make_quant
        opt.make_quant = old_quant.old_make_quant
        gptneox.make_quant = old_quant.old_make_quant
        gptj.make_quant = old_quant.old_make_quant
        monkey_patched_4bit = True
    elif monkey_patched_4bit:
        # Undo monkey patch
        print("Undoing 4-bit old format monkey patch")
        import llama, opt, gptneox, gptj, quant
        llama.make_quant = quant.make_quant
        opt.make_quant = quant.make_quant
        gptneox.make_quant = quant.make_quant
        gptj.make_quant = quant.make_quant
        monkey_patched_4bit = False

    return result, groupsize


def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False):
def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False):
    global model
    global tokenizer
    global model_config
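
The call site inside `load_model` is not part of this excerpt, so the following is only a hypothetical sketch of how the pieces above could fit together: `prepare_4bit_load` supplies the checkpoint path and groupsize, one of the imported GPTQ loaders is picked per architecture, and `load_quant_offload` splits layers across devices. The mapping and the loader signatures (`load_quant(model_dir, checkpoint, wbits, groupsize)` in GPTQ-for-LLaMa style) are assumptions here, not quoted from the diff:

```python
# Hypothetical sketch, not the literal load_model body.
def load_4bit_model(modelpath, model_type, gpu_layers=None):
    path_4bit, groupsize = prepare_4bit_load(modelpath)

    # Pick the GPTQ loader matching the architecture (assumed mapping).
    loaders = {
        "gptj": gptj_load_quant,
        "gpt_neox": gptneox_load_quant,
        "llama": llama_load_quant,
        "opt": opt_load_quant,
    }
    load_quant = loaders[model_type]

    if gpu_layers:
        # Offload helper splits layers between GPU(s) and CPU (assumed signature).
        return load_quant_offload(load_quant, modelpath, path_4bit, 4, groupsize, gpu_layers)
    return load_quant(modelpath, path_4bit, 4, groupsize)
```
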
@@ -1807,7 +1866,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
            disk_layers = args.breakmodel_disklayers
        if breakmodel_args_default_to_cpu and disk_layers is None:
            disk_layers = args.breakmodel_disklayers = 0

    unload_model()

    if online_model == "":
@@ -6472,7 +6531,7 @@ def UI_2_load_model(data):
    koboldai_vars.model = data['model']
    koboldai_vars.custmodpth = data['path']
    print("loading Model")
    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'])
    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'], use_4_bit=data['use_4_bit'])

#==================================================================#
# Event triggered when load story is clicked

@@ -31,6 +31,7 @@ dependencies:
    - flask-cors
    - lupa==1.10
    - transformers==4.28.0
    - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc
    - huggingface_hub==0.12.1
    - safetensors
    - accelerate==0.18.0

@@ -30,9 +30,10 @@ dependencies:
    - flask-cors
    - lupa==1.10
    - transformers==4.28.0
    - git+https://github.com/huggingface/datasets@test-pandas-2.0.0rc
    - huggingface_hub==0.12.1
    - safetensors
    - accelerate
    - accelerate==0.18.0
    - git+https://github.com/VE-FORBRYDERNE/mkultra
    - ansi2html
    - flask_compress

@@ -48,6 +48,8 @@ umamba.exe create -r B:\python\ -n base
umamba.exe install --no-shortcuts -r B:\python\ -n base -f "%~dp0\environments\huggingface.yml" -y --always-copy
umamba.exe -r B:\ clean -a -y
rd B:\Python\pkgs /S /Q
call B:\python\condabin\activate
pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl"
subst B: /d
pause
exit
@@ -60,5 +62,7 @@ umamba.exe create -r miniconda3\ -n base
umamba.exe install --no-shortcuts -r miniconda3 -n base -f environments\huggingface.yml -y --always-copy
umamba.exe clean -a -y
rd miniconda3\Python\pkgs /S /Q
call miniconda3\condabin\activate
pip install "https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-win_amd64.whl"
pause
exit

@@ -5,6 +5,9 @@ wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -
bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y
# A weird micromamba bug causes the first run to fail; running it twice just to be safe, the second run is much faster
bin/micromamba create -f environments/huggingface.yml -r runtime -n koboldai -y

# Install quant_cuda module for 4-bit
bin/micromamba run -r runtime -n koboldai pip install https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-04-10/quant_cuda-0.0.0-cp38-cp38-linux_x86_64.whl
exit
fi
if [[ $1 = "rocm" ]]; then

@@ -1211,7 +1211,7 @@ class system_settings(settings):
                    'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy',
                    'serverstarted', 'inference_config', 'image_pipeline', 'summarizer',
                    'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model',
                    'generating_image', 'bit_8_available', 'host', 'hascuda', 'usegpu', 'rng_states']
                    'generating_image', 'bit_8_available', 'bit_4_available', 'host', 'hascuda', 'usegpu', 'rng_states']
    settings_name = "system"
    def __init__(self, socketio, koboldai_var):
        self._socketio = socketio

@@ -1305,6 +1305,8 @@ class system_settings(settings):
                elif torch.cuda.get_device_properties(device).major == 7 and torch.cuda.get_device_properties(device).minor >= 2:
                    self.bit_8_available = True
                    break
        # Check if repos/gptq exists for 4-bit mode
        self.bit_4_available = os.path.isdir("repos/gptq")
        self.seen_messages = []

repos/gptq (submodule) | 1
Submodule repos/gptq added at 50b22e2ba8

@@ -15,7 +15,7 @@ markdown
bleach==4.1.0
sentencepiece
protobuf
accelerate
accelerate==0.18.0
flask-session==0.4.0
marshmallow>=3.13
apispec-webframeworks

@@ -1472,6 +1472,7 @@ function show_model_menu(data) {
	document.getElementById("modelurl").classList.add("hidden");
	document.getElementById("use_gpu_div").classList.add("hidden");
	document.getElementById("use_8_bit_div").classList.add("hidden");
	document.getElementById("use_4_bit_div").classList.add("hidden");
	document.getElementById("modellayers").classList.add("hidden");
	document.getElementById("oaimodel").classList.add("hidden");
	var model_layer_bars = document.getElementById('model_layer_bars');
@@ -1646,6 +1647,14 @@ function selected_model_info(data) {
		document.getElementById("use_8_bit").checked = false;
	}

	//hide or unhide 4 bit mode
	if (data.bit_4_available) {
		document.getElementById("use_4_bit_div").classList.remove("hidden");
	} else {
		document.getElementById("use_4_bit_div").classList.add("hidden");
		document.getElementById("use_4_bit").checked = false;
	}

	//default URL loading
	if (data.default_url != null) {
		document.getElementById("modelurl").value = data.default_url;
@@ -1815,7 +1824,7 @@
	}
	accept.disabled = false;

	set_4_bit_mode(invert=false);
}

function update_gpu_layers() {
@@ -1876,7 +1885,8 @@ function load_model() {
				'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(),
				'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value,
				'online_model': selected_models,
				'use_8_bit': document.getElementById('use_8_bit').checked};
				'use_8_bit': document.getElementById('use_8_bit').checked,
				'use_4_bit': document.getElementById('use_4_bit').checked};
	socket.emit("load_model", message);
	closePopups();
}
@@ -3160,6 +3170,15 @@ function save_preset() {
	closePopups();
}

function set_4_bit_mode(invert=true) {
	bit_4_status = document.getElementById("use_4_bit").checked;
	if (invert) {
		bit_4_status = !bit_4_status;
	}
}



//--------------------------------------------General UI Functions------------------------------------
function set_ui_level(level) {
	for (classname of ['setting_container', 'setting_container_single', 'setting_container_single_wide', 'biasing', 'palette_area', 'advanced_theme']) {
@@ -7301,4 +7320,4 @@ $el("#gamescreen").addEventListener("paste", function(event) {
		false,
		event.clipboardData.getData("text/plain")
	);
});
});

@@ -75,6 +75,10 @@
				<input type="checkbox" data-toggle="toggle" data-onstyle="success" id="use_8_bit" checked>
				<div class="box-label">Use 8 bit mode</div>
			</div>
			<div class="box flex-push-right hidden" id=use_4_bit_div>
				<input type="checkbox" data-toggle="toggle" data-onstyle="success" id="use_4_bit" checked>
				<div class="box-label">Use 4 bit mode</div>
			</div>
			<button type="button" class="btn popup_load_cancel_button action_button disabled" onclick="load_model()" id="btn_loadmodelaccept" disabled>Load</button>
			<button type="button" class="btn popup_load_cancel_button" onclick='closePopups();' id="btn_loadmodelclose">Cancel</button>
		</div>
@@ -402,4 +406,4 @@
		</div>
	</div>

	<div id="notification-container"></div>
	<div id="notification-container"></div>