Mirror of https://github.com/KoboldAI/KoboldAI-Client.git

Commit: Experimental 8-bit support (the option will only show on Linux systems).
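The heart of the change is forwarding a load_in_8bit flag into Hugging Face from_pretrained, which hands int8 weight quantization to bitsandbytes. Below is a minimal standalone sketch of that loading pattern, assuming bitsandbytes is installed (Linux only) and a CUDA GPU is present; the model id is only an example, KoboldAI itself derives the path from koboldai_vars as the hunks below show.

# Minimal sketch of the 8-bit loading path this commit enables.
# "EleutherAI/gpt-neo-1.3B" is just an example model id, not something
# the commit references.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="cache")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_8bit=True,   # quantize weights to int8 via bitsandbytes
    device_map="auto",   # let accelerate place layers on available devices
    cache_dir="cache",
)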
aiserver.py:
@@ -1613,7 +1613,8 @@ def get_model_info(model, directory=""):
                         'gpu':gpu, 'layer_count':layer_count, 'breakmodel':breakmodel, 'multi_online_models': multi_online_models, 'default_url': default_url,
                         'disk_break_value': disk_blocks, 'disk_break': utils.HAS_ACCELERATE,
                         'break_values': break_values, 'gpu_count': gpu_count,
-                        'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select})
+                        'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select,
+                        'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False})
     if send_horde_models:
         get_cluster_models({'key': key_value, 'url': default_url})
     elif key_value != "" and model in [x[1] for x in model_menu['apilist']] and model != 'CLUSTER':
@@ -2451,7 +2452,7 @@ def reset_model_settings():
     koboldai_vars.revision = None
     koboldai_vars.lazy_load = True

-def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None):
+def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False):
     global model
     global generator
     global torch
@@ -2460,6 +2461,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     global tokenizer
     koboldai_vars.aibusy = True
     koboldai_vars.horde_share = False
+    if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features:
+        use_8_bit = False
+    if use_8_bit:
+        koboldai_vars.lazy_load = False
     if(initial_load):
         use_breakmodel_args = True
     reset_model_settings()
@@ -2896,11 +2901,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 except Exception as e:
                     tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
             try:
-                model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
             except Exception as e:
                 if("out of memory" in traceback.format_exc().lower()):
                     raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
-                model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
         elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))):
             try:
                 tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False)
@@ -2913,11 +2918,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 except Exception as e:
                     tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
             try:
-                model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
             except Exception as e:
                 if("out of memory" in traceback.format_exc().lower()):
                     raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
-                model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
         else:
             old_rebuild_tensor = torch._utils._rebuild_tensor
             def new_rebuild_tensor(storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, stride):
@@ -2943,18 +2948,18 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 except Exception as e:
                     tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
             try:
-                model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
             except Exception as e:
                 if("out of memory" in traceback.format_exc().lower()):
                     raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
-                model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)

             torch._utils._rebuild_tensor = old_rebuild_tensor

             if not args.colab or args.savemodel:
                 import shutil
                 tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')))
-                if(koboldai_vars.fp32_model): # Use save_pretrained to convert fp32 models to fp16
+                if koboldai_vars.fp32_model: # Use save_pretrained to convert fp32 models to fp16
                     model = model.half()
                     model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB")
                 else: # For fp16 models, we can just copy the model files directly
@@ -8483,7 +8488,7 @@ def UI_2_load_model(data):
     koboldai_vars.model = data['model']
     koboldai_vars.custmodpth = data['path']
     print("loading Model")
-    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl)
+    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'])

#==================================================================#
# Event triggered when load story is clicked
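As a quick illustration of the gating introduced in the load_model hunks above, here is a small self-contained sketch; resolve_use_8_bit and its boolean arguments are hypothetical stand-ins for the corresponding koboldai_vars fields, not code from the commit.

# Toy sketch of the 8-bit gating added to load_model(): the request is
# honoured only when bitsandbytes is usable and experimental features are
# enabled; otherwise it silently falls back to normal (non-8-bit) loading.
# resolve_use_8_bit() is a hypothetical helper, not part of aiserver.py.
def resolve_use_8_bit(requested, bit_8_available, experimental_features):
    if not bit_8_available or not experimental_features:
        return False
    return requested

assert resolve_use_8_bit(True, True, True)        # 8-bit load proceeds
assert not resolve_use_8_bit(True, True, False)   # experimental features off
assert not resolve_use_8_bit(True, False, True)   # bitsandbytes unavailable

Note that, per the hunk at line 2461 above, an accepted 8-bit request also forces koboldai_vars.lazy_load off, so the lazy tensor loader is bypassed for these loads.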
Conda environment file (dependencies):

@@ -35,4 +35,5 @@ dependencies:
     - python-socketio[client]
     - ansi2html
     - flask_compress
-    - ijson
+    - ijson
+    - bitsandbytes-cuda111
Settings module (defines the system_settings class):

@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+import importlib
 import os, re, time, threading, json, pickle, base64, copy, tqdm, datetime, sys
 from typing import Union
 from io import BytesIO
@@ -1087,7 +1088,7 @@ class system_settings(settings):
                            'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy',
                            'serverstarted', 'inference_config', 'image_pipeline', 'summarizer',
                            'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model',
-                           'generating_image']
+                           'generating_image', 'bit_8_available']
     settings_name = "system"
     def __init__(self, socketio, koboldai_var):
         self.socketio = socketio
@@ -1171,6 +1172,8 @@ class system_settings(settings):
         self.keep_img_gen_in_memory = False
         self.cookies = {} #cookies for colab since colab's URL changes, cookies are lost
         self.experimental_features = False
+        #check if bitsandbytes is installed
+        self.bit_8_available = importlib.util.find_spec("bitsandbytes") is not None and sys.platform.startswith('linux') #We can install bitsandbytes, but it doesn't work on windows, so limit it here

     @dataclass
     class _inference_config:
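The availability check above can be exercised on its own; a minimal sketch using only the standard library, mirroring the expression added to system_settings.__init__:

# Sketch of the detection logic behind bit_8_available: the option is only
# offered when the bitsandbytes package resolves and the host is Linux,
# since bitsandbytes does not work on Windows.
import importlib.util
import sys

bit_8_available = (
    importlib.util.find_spec("bitsandbytes") is not None
    and sys.platform.startswith("linux")
)
print("8-bit mode available:", bit_8_available)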
Pip requirements file:

@@ -23,4 +23,5 @@ diffusers
 psutil
 ansi2html
 flask_compress
-ijson
+ijson
+bitsandbytes-cuda111
Client-side JavaScript (model load UI):

@@ -1437,6 +1437,7 @@ function show_model_menu(data) {
     document.getElementById("modelkey").value = "";
     document.getElementById("modelurl").classList.add("hidden");
     document.getElementById("use_gpu_div").classList.add("hidden");
+    document.getElementById("use_8_bit_div").classList.add("hidden");
     document.getElementById("modellayers").classList.add("hidden");
     document.getElementById("oaimodel").classList.add("hidden");
     var model_layer_bars = document.getElementById('model_layer_bars');
@@ -1586,6 +1587,14 @@ function selected_model_info(data) {
         document.getElementById("modelurl").classList.add("hidden");
     }

+    //hide or unhide 8 bit mode
+    if (data.bit_8_available) {
+        document.getElementById("use_8_bit_div").classList.remove("hidden");
+    } else {
+        document.getElementById("use_8_bit_div").classList.add("hidden");
+        document.getElementById("use_8_bit").checked = false;
+    }
+
     //default URL loading
     if (data.default_url != null) {
         document.getElementById("modelurl").value = data.default_url;
@@ -1815,7 +1824,8 @@ function load_model() {
     message = {'model': model, 'path': path, 'use_gpu': document.getElementById("use_gpu").checked,
                'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(),
                'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value,
-               'online_model': selected_models};
+               'online_model': selected_models,
+               'use_8_bit': document.getElementById('use_8_bit').checked};
     socket.emit("load_model", message);
     closePopups();
 }
HTML template (model load popup):

@@ -63,6 +63,10 @@
             <input type="checkbox" data-toggle="toggle" data-onstyle="success" id="use_gpu" checked>
             <div class="box-label">Use GPU</div>
         </div>
+        <div class="box flex-push-right hidden" id=use_8_bit_div>
+            <input type="checkbox" data-toggle="toggle" data-onstyle="success" id="use_8_bit" checked>
+            <div class="box-label">Use 8 bit mode</div>
+        </div>
         <button type="button" class="btn popup_load_cancel_button action_button disabled" onclick="load_model()" id="btn_loadmodelaccept" disabled>Load</button>
         <button type="button" class="btn popup_load_cancel_button" onclick='closePopups();' id="btn_loadmodelclose">Cancel</button>
     </div>