diff --git a/aiserver.py b/aiserver.py
index 1eaeda31..08221648 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -1613,7 +1613,8 @@ def get_model_info(model, directory=""):
                          'gpu':gpu, 'layer_count':layer_count, 'breakmodel':breakmodel, 'multi_online_models': multi_online_models, 'default_url': default_url,
                          'disk_break_value': disk_blocks, 'disk_break': utils.HAS_ACCELERATE,
                          'break_values': break_values, 'gpu_count': gpu_count,
-                         'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select})
+                         'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select,
+                         'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False})
     if send_horde_models:
         get_cluster_models({'key': key_value, 'url': default_url})
     elif key_value != "" and model in [x[1] for x in model_menu['apilist']] and model != 'CLUSTER':
@@ -2451,7 +2452,7 @@ def reset_model_settings():
     koboldai_vars.revision = None
     koboldai_vars.lazy_load = True
 
-def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None):
+def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False):
     global model
     global generator
     global torch
@@ -2460,6 +2461,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
     global tokenizer
     koboldai_vars.aibusy = True
     koboldai_vars.horde_share = False
+    if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features:
+        use_8_bit = False
+    if use_8_bit:
+        koboldai_vars.lazy_load = False
     if(initial_load):
         use_breakmodel_args = True
     reset_model_settings()
@@ -2896,11 +2901,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 except Exception as e:
                     tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
                 try:
-                    model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                    model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
                 except Exception as e:
                     if("out of memory" in traceback.format_exc().lower()):
                         raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
-                    model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                    model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
             elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))):
                 try:
                     tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False)
@@ -2913,11 +2918,11 @@
                 except Exception as e:
                     tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
                 try:
-                    model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                    model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
                 except Exception as e:
                     if("out of memory" in traceback.format_exc().lower()):
                         raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
-                    model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                    model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
             else:
                 old_rebuild_tensor = torch._utils._rebuild_tensor
                 def new_rebuild_tensor(storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, stride):
@@ -2943,18 +2948,18 @@
                 except Exception as e:
                     tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
                 try:
-                    model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                    model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
                 except Exception as e:
                     if("out of memory" in traceback.format_exc().lower()):
                         raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
-                    model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", **lowmem)
+                    model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem)
 
                 torch._utils._rebuild_tensor = old_rebuild_tensor
 
         if not args.colab or args.savemodel:
             import shutil
             tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')))
-            if(koboldai_vars.fp32_model): # Use save_pretrained to convert fp32 models to fp16
+            if koboldai_vars.fp32_model: # Use save_pretrained to convert fp32 models to fp16
                 model = model.half()
                 model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB")
             else: # For fp16 models, we can just copy the model files directly
@@ -8483,7 +8488,7 @@ def UI_2_load_model(data):
     koboldai_vars.model = data['model']
     koboldai_vars.custmodpth = data['path']
     print("loading Model")
-    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl)
+    load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'])
 
 #==================================================================#
 # Event triggered when load story is clicked
diff --git a/environments/huggingface.yml b/environments/huggingface.yml
index f7b146cd..3b3a6307 100644
--- a/environments/huggingface.yml
+++ b/environments/huggingface.yml
@@ -35,4 +35,5 @@ dependencies:
   - python-socketio[client]
   - ansi2html
   - flask_compress
-  - ijson
\ No newline at end of file
+  - ijson
+  - bitsandbytes-cuda111
\ No newline at end of file
diff --git a/koboldai_settings.py b/koboldai_settings.py
index a654f7c8..a914af33 100644
--- a/koboldai_settings.py
+++ b/koboldai_settings.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+import importlib
 import os, re, time, threading, json, pickle, base64, copy, tqdm, datetime, sys
 from typing import Union
 from io import BytesIO
@@ -1087,7 +1088,7 @@ class system_settings(settings):
                          'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy',
                          'serverstarted', 'inference_config', 'image_pipeline', 'summarizer',
                          'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model',
-                         'generating_image']
+                         'generating_image', 'bit_8_available']
     settings_name = "system"
     def __init__(self, socketio, koboldai_var):
         self.socketio = socketio
@@ -1171,6 +1172,8 @@ class system_settings(settings):
         self.keep_img_gen_in_memory = False
         self.cookies = {} #cookies for colab since colab's URL changes, cookies are lost
         self.experimental_features = False
+        #check if bitsandbytes is installed
+        self.bit_8_available = importlib.util.find_spec("bitsandbytes") is not None and sys.platform.startswith('linux') #We can install bitsandbytes, but it doesn't work on windows, so limit it here
 
     @dataclass
     class _inference_config:
diff --git a/requirements.txt b/requirements.txt
index 0650e1e1..dba4bacc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,4 +23,5 @@ diffusers
 psutil
 ansi2html
 flask_compress
-ijson
\ No newline at end of file
+ijson
+bitsandbytes-cuda111
\ No newline at end of file
diff --git a/static/koboldai.js b/static/koboldai.js
index 4397194b..7987d983 100644
--- a/static/koboldai.js
+++ b/static/koboldai.js
@@ -1437,6 +1437,7 @@ function show_model_menu(data) {
     document.getElementById("modelkey").value = "";
     document.getElementById("modelurl").classList.add("hidden");
     document.getElementById("use_gpu_div").classList.add("hidden");
+    document.getElementById("use_8_bit_div").classList.add("hidden");
     document.getElementById("modellayers").classList.add("hidden");
     document.getElementById("oaimodel").classList.add("hidden");
     var model_layer_bars = document.getElementById('model_layer_bars');
@@ -1586,6 +1587,14 @@ function selected_model_info(data) {
         document.getElementById("modelurl").classList.add("hidden");
     }
     
+    //hide or unhide 8 bit mode
+    if (data.bit_8_available) {
+        document.getElementById("use_8_bit_div").classList.remove("hidden");
+    } else {
+        document.getElementById("use_8_bit_div").classList.add("hidden");
+        document.getElementById("use_8_bit").checked = false;
+    }
+    
     //default URL loading
     if (data.default_url != null) {
         document.getElementById("modelurl").value = data.default_url;
@@ -1815,7 +1824,8 @@ function load_model() {
     message = {'model': model, 'path': path, 'use_gpu': document.getElementById("use_gpu").checked,
                'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(),
                'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value,
-               'online_model': selected_models};
+               'online_model': selected_models,
+               'use_8_bit': document.getElementById('use_8_bit').checked};
     socket.emit("load_model", message);
     closePopups();
 }
diff --git a/templates/popups.html b/templates/popups.html
index 47564f07..3e66753a 100644
--- a/templates/popups.html
+++ b/templates/popups.html
@@ -63,6 +63,10 @@
 				Use GPU
+
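
The 8-bit path that the patch wires together can be illustrated with a short, standalone sketch. This is not part of the patch: it assumes transformers, accelerate, and bitsandbytes are installed on a Linux machine with a CUDA GPU, and the model name is only an example.

    import importlib.util
    import sys

    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Mirror the bit_8_available check from koboldai_settings.py: bitsandbytes can be
    # installed on Windows, but it does not work there, so gate on the platform too.
    bit_8_available = (
        importlib.util.find_spec("bitsandbytes") is not None
        and sys.platform.startswith("linux")
    )

    model_name = "EleutherAI/gpt-neo-1.3B"  # example model, not taken from the patch
    use_8_bit = bit_8_available  # the patch additionally requires experimental_features

    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="cache")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        cache_dir="cache",
        load_in_8bit=use_8_bit,  # quantize linear-layer weights to int8 via bitsandbytes
        device_map="auto",       # let accelerate place layers on the available devices
    )

The patch also forces koboldai_vars.lazy_load = False whenever use_8_bit is set, presumably because KoboldAI's lazy loader and the bitsandbytes quantization path do not combine cleanly.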