From 4035221b9c9f3c436257fb296791528e5ec55725 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 28 Nov 2022 16:18:45 -0600 Subject: [PATCH 01/26] Don't activate attention bias until needed --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 7a2a8776..730edc74 100644 --- a/aiserver.py +++ b/aiserver.py @@ -5105,7 +5105,7 @@ def calcsubmit(txt): logger.debug("Submit: get_text time {}s".format(time.time()-start_time)) start_time = time.time() - if koboldai_vars.experimental_features: + if koboldai_vars.experimental_features and any([c.get("attention_multiplier", 1) != 1 for c in koboldai_vars.context]): offset = 0 applied_biases = [] for c in koboldai_vars.context: From 20851485e0ef1e8e1d7b5dfd2f5d6f160e1c0416 Mon Sep 17 00:00:00 2001 From: ebolam Date: Tue, 29 Nov 2022 17:06:23 -0500 Subject: [PATCH 02/26] Experimental 8-bit support (option will only show on linux systems) --- aiserver.py | 25 +++++++++++++++---------- environments/huggingface.yml | 3 ++- koboldai_settings.py | 5 ++++- requirements.txt | 3 ++- static/koboldai.js | 12 +++++++++++- templates/popups.html | 4 ++++ 6 files changed, 38 insertions(+), 14 deletions(-) diff --git a/aiserver.py b/aiserver.py index 1eaeda31..08221648 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1613,7 +1613,8 @@ def get_model_info(model, directory=""): 'gpu':gpu, 'layer_count':layer_count, 'breakmodel':breakmodel, 'multi_online_models': multi_online_models, 'default_url': default_url, 'disk_break_value': disk_blocks, 'disk_break': utils.HAS_ACCELERATE, 'break_values': break_values, 'gpu_count': gpu_count, - 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select}) + 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select, + 'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False}) if send_horde_models: get_cluster_models({'key': key_value, 'url': default_url}) elif key_value != "" and model in [x[1] for x in model_menu['apilist']] and model != 'CLUSTER': @@ -2451,7 +2452,7 @@ def reset_model_settings(): koboldai_vars.revision = None koboldai_vars.lazy_load = True -def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None): +def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False): global model global generator global torch @@ -2460,6 +2461,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal global tokenizer koboldai_vars.aibusy = True koboldai_vars.horde_share = False + if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features: + use_8_bit = False + if use_8_bit: + koboldai_vars.lazy_load = False if(initial_load): use_breakmodel_args = True reset_model_settings() @@ -2896,11 +2901,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, 
revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) @@ -2913,11 +2918,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) else: old_rebuild_tensor = torch._utils._rebuild_tensor def new_rebuild_tensor(storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, stride): @@ -2943,18 +2948,18 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) + model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) torch._utils._rebuild_tensor = old_rebuild_tensor if not args.colab or args.savemodel: import shutil tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) - if(koboldai_vars.fp32_model): # Use save_pretrained to convert fp32 models to fp16 + if koboldai_vars.fp32_model: # Use save_pretrained to convert fp32 models to fp16 model = 
model.half() model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB") else: # For fp16 models, we can just copy the model files directly @@ -8483,7 +8488,7 @@ def UI_2_load_model(data): koboldai_vars.model = data['model'] koboldai_vars.custmodpth = data['path'] print("loading Model") - load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl) + load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit']) #==================================================================# # Event triggered when load story is clicked diff --git a/environments/huggingface.yml b/environments/huggingface.yml index f7b146cd..3b3a6307 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -35,4 +35,5 @@ dependencies: - python-socketio[client] - ansi2html - flask_compress - - ijson \ No newline at end of file + - ijson + - bitsandbytes-cuda111 \ No newline at end of file diff --git a/koboldai_settings.py b/koboldai_settings.py index a654f7c8..a914af33 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +import importlib import os, re, time, threading, json, pickle, base64, copy, tqdm, datetime, sys from typing import Union from io import BytesIO @@ -1087,7 +1088,7 @@ class system_settings(settings): 'lua_koboldcore', 'sp', 'sp_length', '_horde_pid', 'horde_share', 'aibusy', 'serverstarted', 'inference_config', 'image_pipeline', 'summarizer', 'summary_tokenizer', 'use_colab_tpu', 'noai', 'disable_set_aibusy', 'cloudflare_link', 'tts_model', - 'generating_image'] + 'generating_image', 'bit_8_available'] settings_name = "system" def __init__(self, socketio, koboldai_var): self.socketio = socketio @@ -1171,6 +1172,8 @@ class system_settings(settings): self.keep_img_gen_in_memory = False self.cookies = {} #cookies for colab since colab's URL changes, cookies are lost self.experimental_features = False + #check if bitsandbytes is installed + self.bit_8_available = importlib.util.find_spec("bitsandbytes") is not None and sys.platform.startswith('linux') #We can install bitsandbytes, but it doesn't work on windows, so limit it here @dataclass class _inference_config: diff --git a/requirements.txt b/requirements.txt index 0650e1e1..dba4bacc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,5 @@ diffusers psutil ansi2html flask_compress -ijson \ No newline at end of file +ijson +bitsandbytes-cuda111 \ No newline at end of file diff --git a/static/koboldai.js b/static/koboldai.js index 4397194b..7987d983 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -1437,6 +1437,7 @@ function show_model_menu(data) { document.getElementById("modelkey").value = ""; document.getElementById("modelurl").classList.add("hidden"); document.getElementById("use_gpu_div").classList.add("hidden"); + document.getElementById("use_8_bit_div").classList.add("hidden"); document.getElementById("modellayers").classList.add("hidden"); document.getElementById("oaimodel").classList.add("hidden"); var model_layer_bars = document.getElementById('model_layer_bars'); @@ -1586,6 +1587,14 @@ function selected_model_info(data) { document.getElementById("modelurl").classList.add("hidden"); } + //hide or unhide 8 bit mode + if (data.bit_8_available) { + 
document.getElementById("use_8_bit_div").classList.remove("hidden"); + } else { + document.getElementById("use_8_bit_div").classList.add("hidden"); + document.getElementById("use_8_bit").checked = false; + } + //default URL loading if (data.default_url != null) { document.getElementById("modelurl").value = data.default_url; @@ -1815,7 +1824,8 @@ function load_model() { message = {'model': model, 'path': path, 'use_gpu': document.getElementById("use_gpu").checked, 'key': document.getElementById('modelkey').value, 'gpu_layers': gpu_layers.join(), 'disk_layers': disk_layers, 'url': document.getElementById("modelurl").value, - 'online_model': selected_models}; + 'online_model': selected_models, + 'use_8_bit': document.getElementById('use_8_bit').checked}; socket.emit("load_model", message); closePopups(); } diff --git a/templates/popups.html b/templates/popups.html index 47564f07..3e66753a 100644 --- a/templates/popups.html +++ b/templates/popups.html @@ -63,6 +63,10 @@
Use GPU
+
From be5d7254ae57a12b754ff3b0458246a5c406a03e Mon Sep 17 00:00:00 2001
From: ebolam
Date: Tue, 29 Nov 2022 18:12:33 -0500
Subject: [PATCH 03/26] Fix for KoboldAI Base Docker Image to work with Horde

---
 docker-standalone/Dockerfile.base | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker-standalone/Dockerfile.base b/docker-standalone/Dockerfile.base
index 41209c2a..d524f858 100644
--- a/docker-standalone/Dockerfile.base
+++ b/docker-standalone/Dockerfile.base
@@ -5,6 +5,7 @@ COPY ./install_requirements.sh /opt/koboldai
 USER root
 RUN apt update && apt install wget aria2 git bzip2 python3 python3-venv -y
 RUN ./install_requirements.sh cuda;rm -rf ~/.cache/pip
+copy ./KoboldAI-Horde /opt/koboldai/
 RUN python3 -m venv /opt/koboldai/KoboldAI-Horde/venv
 RUN /opt/koboldai/KoboldAI-Horde/venv/bin/pip install -r /opt/koboldai/KoboldAI-Horde/requirements.txt
 ENV PATH=/opt/conda/bin/:$PATH

From 9f41c1366a0ece1a84b19eac970d5d15c5016356 Mon Sep 17 00:00:00 2001
From: ebolam
Date: Tue, 29 Nov 2022 18:12:33 -0500
Subject: [PATCH 04/26] Fix for KoboldAI Base Docker Image to work with Horde

---
 docker-standalone/Dockerfile.base | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker-standalone/Dockerfile.base b/docker-standalone/Dockerfile.base
index 41209c2a..515dede4 100644
--- a/docker-standalone/Dockerfile.base
+++ b/docker-standalone/Dockerfile.base
@@ -5,6 +5,7 @@ COPY ./install_requirements.sh /opt/koboldai
 USER root
 RUN apt update && apt install wget aria2 git bzip2 python3 python3-venv -y
 RUN ./install_requirements.sh cuda;rm -rf ~/.cache/pip
+RUN git clone https://github.com/db0/KoboldAI-Horde-Bridge /opt/koboldai/KoboldAI-Horde
 RUN python3 -m venv /opt/koboldai/KoboldAI-Horde/venv
 RUN /opt/koboldai/KoboldAI-Horde/venv/bin/pip install -r /opt/koboldai/KoboldAI-Horde/requirements.txt
 ENV PATH=/opt/conda/bin/:$PATH

From 2941860f47cd52928cadad1c87afd563781c468f Mon Sep 17 00:00:00 2001
From: ebolam
Date: Tue, 29 Nov 2022 18:22:05 -0500
Subject: [PATCH 05/26] Fix for whatever happened on the last commit....
--- docker-standalone/Dockerfile.base | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docker-standalone/Dockerfile.base b/docker-standalone/Dockerfile.base index d31071a0..515dede4 100644 --- a/docker-standalone/Dockerfile.base +++ b/docker-standalone/Dockerfile.base @@ -5,11 +5,7 @@ COPY ./install_requirements.sh /opt/koboldai USER root RUN apt update && apt install wget aria2 git bzip2 python3 python3-venv -y RUN ./install_requirements.sh cuda;rm -rf ~/.cache/pip -<<<<<<< HEAD RUN git clone https://github.com/db0/KoboldAI-Horde-Bridge /opt/koboldai/KoboldAI-Horde -======= -copy ./KoboldAI-Horde /opt/koboldai/ ->>>>>>> be5d7254ae57a12b754ff3b0458246a5c406a03e RUN python3 -m venv /opt/koboldai/KoboldAI-Horde/venv RUN /opt/koboldai/KoboldAI-Horde/venv/bin/pip install -r /opt/koboldai/KoboldAI-Horde/requirements.txt ENV PATH=/opt/conda/bin/:$PATH From be310bd10bf3645c4e7f685fb92880ec2b520174 Mon Sep 17 00:00:00 2001 From: ebolam Date: Tue, 29 Nov 2022 18:57:21 -0500 Subject: [PATCH 06/26] 8 bit update --- environments/huggingface.yml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 3b3a6307..e87652b5 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -36,4 +36,4 @@ dependencies: - ansi2html - flask_compress - ijson - - bitsandbytes-cuda111 \ No newline at end of file + - bitsandbytes \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index dba4bacc..59fa49da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,4 +24,4 @@ psutil ansi2html flask_compress ijson -bitsandbytes-cuda111 \ No newline at end of file +bitsandbytes \ No newline at end of file From aabc73b3f74467aaf75a71afc7686584a87989cc Mon Sep 17 00:00:00 2001 From: ebolam Date: Tue, 29 Nov 2022 19:29:06 -0500 Subject: [PATCH 07/26] Adding debug for 8 bit --- aiserver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/aiserver.py b/aiserver.py index 4a302a13..d99dc16a 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2948,6 +2948,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: + logger.info("Using 8 bit: {}".format(use_8_bit)) model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) except Exception as e: if("out of memory" in traceback.format_exc().lower()): From 0581a74ead34eb330822ff235ce0010197a8e3e1 Mon Sep 17 00:00:00 2001 From: ebolam Date: Tue, 29 Nov 2022 19:34:43 -0500 Subject: [PATCH 08/26] Debug for 8 bit --- aiserver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aiserver.py b/aiserver.py index d99dc16a..193a0ef7 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2461,10 +2461,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal global tokenizer koboldai_vars.aibusy = True koboldai_vars.horde_share = False + if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features: use_8_bit = False if use_8_bit: koboldai_vars.lazy_load = False + logger.info("koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load)) if(initial_load): use_breakmodel_args = True reset_model_settings() From 85a5997cc8a382600555d8211e030ea4ef325112 Mon Sep 17 00:00:00 2001 From: ebolam Date: Tue, 29 Nov 2022 19:55:54 -0500 Subject: [PATCH 09/26] 
8 bit mode fix --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 193a0ef7..b4105618 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2461,6 +2461,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal global tokenizer koboldai_vars.aibusy = True koboldai_vars.horde_share = False + reset_model_settings() if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features: use_8_bit = False @@ -2469,7 +2470,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal logger.info("koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load)) if(initial_load): use_breakmodel_args = True - reset_model_settings() if not utils.HAS_ACCELERATE: disk_layers = None koboldai_vars.reset_model() From f0fa80cd98440b49ea471ae4f7074d080a133a07 Mon Sep 17 00:00:00 2001 From: ebolam Date: Tue, 29 Nov 2022 20:09:39 -0500 Subject: [PATCH 10/26] 8 bit testing --- aiserver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/aiserver.py b/aiserver.py index b4105618..5f2f5b37 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2467,6 +2467,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal use_8_bit = False if use_8_bit: koboldai_vars.lazy_load = False + koboldai_vars.breakmodel = False logger.info("koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load)) if(initial_load): use_breakmodel_args = True From 1b5adf8af5cbdd30f784704bc0e90820c31407fb Mon Sep 17 00:00:00 2001 From: ebolam Date: Tue, 29 Nov 2022 20:19:11 -0500 Subject: [PATCH 11/26] 8 bit --- aiserver.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 5f2f5b37..789e7b1f 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2632,7 +2632,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal elif koboldai_vars.hascuda: if(koboldai_vars.bmsupported): koboldai_vars.usegpu = False - koboldai_vars.breakmodel = True + koboldai_vars.breakmodel = True if not koboldai_vars.use_8_bit else False else: koboldai_vars.breakmodel = False koboldai_vars.usegpu = use_gpu @@ -2680,6 +2680,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # Lazy loader import torch_lazy_loader def get_lazy_load_callback(n_layers, convert_to_float16=True): + logger.info("In Callback - koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load)) if not koboldai_vars.lazy_load: return @@ -2921,6 +2922,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: + logger.info("Using 8 bit: {}".format(use_8_bit)) model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) except Exception as e: if("out of memory" in traceback.format_exc().lower()): From 18db7b98f5f8de892750d216d8c6de31b36270f8 Mon Sep 17 00:00:00 2001 From: ebolam Date: Tue, 29 Nov 2022 20:20:15 -0500 Subject: [PATCH 12/26] 8 bit --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 789e7b1f..303977e8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2632,7 +2632,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal elif koboldai_vars.hascuda: 
if(koboldai_vars.bmsupported): koboldai_vars.usegpu = False - koboldai_vars.breakmodel = True if not koboldai_vars.use_8_bit else False + koboldai_vars.breakmodel = True if not use_8_bit else False else: koboldai_vars.breakmodel = False koboldai_vars.usegpu = use_gpu From aa5207f5c62aa7c283a1d9d3cb380956c4e0974b Mon Sep 17 00:00:00 2001 From: ebolam Date: Wed, 30 Nov 2022 07:21:16 -0500 Subject: [PATCH 13/26] Ensure we don't save 8bit models --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 303977e8..f5f7c048 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2962,7 +2962,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal torch._utils._rebuild_tensor = old_rebuild_tensor - if not args.colab or args.savemodel: + if (not args.colab or args.savemodel) and not use_8_bit: import shutil tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) if koboldai_vars.fp32_model: # Use save_pretrained to convert fp32 models to fp16 From 06a25b46630298bd047cf8e02a06bc5acc8d6ca0 Mon Sep 17 00:00:00 2001 From: ebolam Date: Wed, 30 Nov 2022 07:48:41 -0500 Subject: [PATCH 14/26] Fix for possible recursion bug --- aiserver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/aiserver.py b/aiserver.py index f5f7c048..b3604785 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2929,6 +2929,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) else: + try: + torch._utils._rebuild_tensor = old_rebuild_tensor + except: + pass old_rebuild_tensor = torch._utils._rebuild_tensor def new_rebuild_tensor(storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, stride): if(not isinstance(storage, torch_lazy_loader.LazyTensor)): From e278abd7c94b1b734c1757e34fa3ea21dae22f70 Mon Sep 17 00:00:00 2001 From: ebolam Date: Wed, 30 Nov 2022 18:56:22 -0500 Subject: [PATCH 15/26] Copy of VE's 8 bit lazyloading/breakmodel code --- aiserver.py | 18 +++++++++--------- torch_lazy_loader.py | 32 ++++++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/aiserver.py b/aiserver.py index b3604785..3598578b 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2465,10 +2465,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features: use_8_bit = False - if use_8_bit: - koboldai_vars.lazy_load = False - koboldai_vars.breakmodel = False - logger.info("koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load)) if(initial_load): use_breakmodel_args = True if not utils.HAS_ACCELERATE: @@ -2680,7 +2676,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # Lazy loader import torch_lazy_loader def get_lazy_load_callback(n_layers, convert_to_float16=True): - logger.info("In Callback - koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load)) if not koboldai_vars.lazy_load: return @@ -2742,6 +2737,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.loaded_layers = 0 utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", 
file=Send_to_socketio()) + if koboldai_vars.bit_8_available: + import bitsandbytes as bnb with zipfile.ZipFile(f, "r") as z: try: last_storage_key = None @@ -2769,8 +2766,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model_dict[key] = model_dict[key].materialize(f, map_location="cpu") if model_dict[key].dtype is torch.float32: koboldai_vars.fp32_model = True - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: - model_dict[key] = model_dict[key].to(torch.float16) + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and any(model_dict[key].dtype is t for t in (torch.float32, torch.float16)): + if use_8_bit: + model_dict[key] = bnb.nn.Int8Params(model_dict[key].to(torch.float16), requires_grad=False, has_fp16_weights=False).to(device if device not in ("shared", "disk") else "cpu") + else: + model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -2882,7 +2882,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal import shutil shutil.move(koboldai_vars.model.replace('/', '_'), "models/{}".format(koboldai_vars.model.replace('/', '_'))) if(koboldai_vars.lazy_load): # If we're using lazy loader, we need to figure out what the model's hidden layers are called - with torch_lazy_loader.use_lazy_torch_load(dematerialized_modules=True, use_accelerate_init_empty_weights=True): + with torch_lazy_loader.use_lazy_torch_load(bit_8_available=koboldai_vars.bit_8_available, dematerialized_modules=True, use_accelerate_init_empty_weights=True): try: metamodel = AutoModelForCausalLM.from_config(model_config) except Exception as e: @@ -2890,7 +2890,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal utils.layers_module_names = utils.get_layers_module_names(metamodel) utils.module_names = list(metamodel.state_dict().keys()) utils.named_buffers = list(metamodel.named_buffers(recurse=True)) - with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(enable=koboldai_vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if koboldai_vars.lazy_load else None, dematerialized_modules=True): + with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(bit_8_available=koboldai_vars.bit_8_available, enable=koboldai_vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if koboldai_vars.lazy_load else None, dematerialized_modules=True): if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): diff --git a/torch_lazy_loader.py b/torch_lazy_loader.py index 1298335d..8cd7cc50 100644 --- a/torch_lazy_loader.py +++ b/torch_lazy_loader.py @@ -225,7 +225,11 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, miss try: with torch.no_grad(): #param.copy_(input_param) - new_param = torch.nn.Parameter(input_param, requires_grad=param.requires_grad) # This line is new + import bitsandbytes as bnb # This line is new + if isinstance(input_param, bnb.nn.Int8Params): # This line is new + new_param = 
input_param # This line is new + else: # This line is new + new_param = torch.nn.Parameter(input_param, requires_grad=param.requires_grad) # This line is new if name in self._parameters: # This line is new self._parameters[name] = new_param # This line is new if name in persistent_buffers: # This line is new @@ -277,7 +281,7 @@ def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler pickle.load = old_pickle_load @contextlib.contextmanager -def use_lazy_torch_load(enable=True, callback: Optional[Callable] = None, dematerialized_modules=False, use_accelerate_init_empty_weights=False): +def use_lazy_torch_load(bit_8_available=False, enable=True, callback: Optional[Callable] = None, dematerialized_modules=False, use_accelerate_init_empty_weights=False): if not enable: with use_custom_unpickler(RestrictedUnpickler): yield False @@ -298,17 +302,30 @@ def use_lazy_torch_load(enable=True, callback: Optional[Callable] = None, demate torch.load = torch_load if dematerialized_modules: + old_linear_init = torch.nn.Linear.__init__ + old_embedding_init = torch.nn.Embedding.__init__ + old_layernorm_init = torch.nn.LayerNorm.__init__ + old_load_from_state_dict = torch.nn.Module._load_from_state_dict if use_accelerate_init_empty_weights and utils.HAS_ACCELERATE: import accelerate init_empty_weights = accelerate.init_empty_weights() init_empty_weights.__enter__() else: - old_linear_init = torch.nn.Linear.__init__ - old_embedding_init = torch.nn.Embedding.__init__ - old_layernorm_init = torch.nn.LayerNorm.__init__ - + import accelerate + if bit_8_available: + import bitsandbytes as bnb + def linear_init(self, *args, device=None, **kwargs): - return old_linear_init(self, *args, device="meta", **kwargs) + if linear_init.nested_flag or not bit_8_available: + return old_linear_init(self, *args, device=device, **kwargs) + linear_init.nested_flag = True + try: + self.__class__ = bnb.nn.Linear8bitLt + with accelerate.init_empty_weights(): + return bnb.nn.Linear8bitLt.__init__(self, *args, has_fp16_weights=False, threshold=6.0, **kwargs) + finally: + linear_init.nested_flag = False + linear_init.nested_flag = False def embedding_init(self, *args, device=None, **kwargs): return old_embedding_init(self, *args, device="meta", **kwargs) @@ -319,7 +336,6 @@ def use_lazy_torch_load(enable=True, callback: Optional[Callable] = None, demate torch.nn.Linear.__init__ = linear_init torch.nn.Embedding.__init__ = embedding_init torch.nn.LayerNorm.__init__ = layernorm_init - old_load_from_state_dict = torch.nn.Module._load_from_state_dict torch.nn.Module._load_from_state_dict = _load_from_state_dict with use_custom_unpickler(_LazyUnpickler): From 88c536b194c685cf8d4bb802e0fcb214342c816f Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 1 Dec 2022 08:46:35 -0500 Subject: [PATCH 16/26] Disable bitsandbytes if there are no GPUs with proper CUDA Compute capabilities (needs 7.2 or greater) --- koboldai_settings.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/koboldai_settings.py b/koboldai_settings.py index a914af33..061de2a0 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1173,7 +1173,17 @@ class system_settings(settings): self.cookies = {} #cookies for colab since colab's URL changes, cookies are lost self.experimental_features = False #check if bitsandbytes is installed - self.bit_8_available = importlib.util.find_spec("bitsandbytes") is not None and sys.platform.startswith('linux') #We can install bitsandbytes, but it doesn't work on windows, so limit it 
here + self.bit_8_available = False + if importlib.util.find_spec("bitsandbytes") is not None and sys.platform.startswith('linux'): #We can install bitsandbytes, but it doesn't work on windows, so limit it here + if torch.cuda.is_available(): + for device in range(torch.cuda.device_count()): + if torch.cuda.get_device_properties(device).major > 7: + self.bit_8_available = True + break + elif torch.cuda.get_device_properties(device).major = 7 and torch.cuda.get_device_properties(device).minor >= 2: + self.bit_8_available = True + break + @dataclass class _inference_config: From 5e7789f90b9901eaaeb3407b1336aa876df1253f Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 1 Dec 2022 08:48:47 -0500 Subject: [PATCH 17/26] Fix --- koboldai_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/koboldai_settings.py b/koboldai_settings.py index 061de2a0..db33142a 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1180,7 +1180,7 @@ class system_settings(settings): if torch.cuda.get_device_properties(device).major > 7: self.bit_8_available = True break - elif torch.cuda.get_device_properties(device).major = 7 and torch.cuda.get_device_properties(device).minor >= 2: + elif torch.cuda.get_device_properties(device).major == 7 and torch.cuda.get_device_properties(device).minor >= 2: self.bit_8_available = True break From 94953def3291031451f4da9a76fd86e2bb81287e Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 1 Dec 2022 09:20:24 -0500 Subject: [PATCH 18/26] More 8 bit debuging --- aiserver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/aiserver.py b/aiserver.py index 3598578b..3f544f3c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2905,10 +2905,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: + logger.info("Using 8 bit: {}".format(use_8_bit)) model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") + logger.info("Automodel failed, falling back to GPTNeo") model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: @@ -2927,6 +2929,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") + logger.info("Automodel failed, falling back to GPTNeo") model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) else: try: @@ -2962,6 +2965,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") + logger.info("Automodel failed, falling back to GPTNeo") model = 
GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) torch._utils._rebuild_tensor = old_rebuild_tensor From fcdfce0373605c8eb85fc950bd3cd0c811b6e348 Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 1 Dec 2022 10:41:13 -0500 Subject: [PATCH 19/26] 8 bit debug --- torch_lazy_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_lazy_loader.py b/torch_lazy_loader.py index 8cd7cc50..dc3edfb3 100644 --- a/torch_lazy_loader.py +++ b/torch_lazy_loader.py @@ -316,7 +316,7 @@ def use_lazy_torch_load(bit_8_available=False, enable=True, callback: Optional[C import bitsandbytes as bnb def linear_init(self, *args, device=None, **kwargs): - if linear_init.nested_flag or not bit_8_available: + if linear_init.nested_flag: return old_linear_init(self, *args, device=device, **kwargs) linear_init.nested_flag = True try: @@ -325,7 +325,7 @@ def use_lazy_torch_load(bit_8_available=False, enable=True, callback: Optional[C return bnb.nn.Linear8bitLt.__init__(self, *args, has_fp16_weights=False, threshold=6.0, **kwargs) finally: linear_init.nested_flag = False - linear_init.nested_flag = False + linear_init.nested_flag = bit_8_available def embedding_init(self, *args, device=None, **kwargs): return old_embedding_init(self, *args, device="meta", **kwargs) From 2505d091811a78073246fbc71d0149b7d98e5d0e Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 1 Dec 2022 10:43:59 -0500 Subject: [PATCH 20/26] 8 bit debug --- torch_lazy_loader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torch_lazy_loader.py b/torch_lazy_loader.py index dc3edfb3..ec99df11 100644 --- a/torch_lazy_loader.py +++ b/torch_lazy_loader.py @@ -316,7 +316,7 @@ def use_lazy_torch_load(bit_8_available=False, enable=True, callback: Optional[C import bitsandbytes as bnb def linear_init(self, *args, device=None, **kwargs): - if linear_init.nested_flag: + if linear_init.nested_flag or not linear_init.bit_8_available: return old_linear_init(self, *args, device=device, **kwargs) linear_init.nested_flag = True try: @@ -325,7 +325,8 @@ def use_lazy_torch_load(bit_8_available=False, enable=True, callback: Optional[C return bnb.nn.Linear8bitLt.__init__(self, *args, has_fp16_weights=False, threshold=6.0, **kwargs) finally: linear_init.nested_flag = False - linear_init.nested_flag = bit_8_available + linear_init.nested_flag = False + linear_init.bit_8_available = bit_8_available def embedding_init(self, *args, device=None, **kwargs): return old_embedding_init(self, *args, device="meta", **kwargs) From 76a0bb71f098ffb9d362df26576509d6ff2db134 Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 1 Dec 2022 10:46:13 -0500 Subject: [PATCH 21/26] 8 bit debug --- torch_lazy_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_lazy_loader.py b/torch_lazy_loader.py index ec99df11..474693a7 100644 --- a/torch_lazy_loader.py +++ b/torch_lazy_loader.py @@ -317,7 +317,7 @@ def use_lazy_torch_load(bit_8_available=False, enable=True, callback: Optional[C def linear_init(self, *args, device=None, **kwargs): if linear_init.nested_flag or not linear_init.bit_8_available: - return old_linear_init(self, *args, device=device, **kwargs) + return old_linear_init(self, *args, device="meta", **kwargs) linear_init.nested_flag = True try: self.__class__ = bnb.nn.Linear8bitLt From 4dfbf809297f0e044de654ce040e40436242e95f Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 1 Dec 2022 11:04:43 -0500 Subject: 
[PATCH 22/26] Not sure why this fixes it, or why we need this line...... --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 3f544f3c..9aeedfbd 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3006,7 +3006,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.hascuda): if(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) - model = model.half().to(koboldai_vars.gpu_device) + #model = model.half().to(koboldai_vars.gpu_device) generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) From ebdae49746856355700f3feb47b7374f674f2ac7 Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 1 Dec 2022 11:06:54 -0500 Subject: [PATCH 23/26] Align again with VE --- torch_lazy_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_lazy_loader.py b/torch_lazy_loader.py index 474693a7..ec99df11 100644 --- a/torch_lazy_loader.py +++ b/torch_lazy_loader.py @@ -317,7 +317,7 @@ def use_lazy_torch_load(bit_8_available=False, enable=True, callback: Optional[C def linear_init(self, *args, device=None, **kwargs): if linear_init.nested_flag or not linear_init.bit_8_available: - return old_linear_init(self, *args, device="meta", **kwargs) + return old_linear_init(self, *args, device=device, **kwargs) linear_init.nested_flag = True try: self.__class__ = bnb.nn.Linear8bitLt From e1f6de12508d25b3e04d4ca5a6a21975e35ec67c Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 1 Dec 2022 14:54:44 -0500 Subject: [PATCH 24/26] Revert back out VE 8 bit loading --- aiserver.py | 24 ++++++++++-------------- torch_lazy_loader.py | 33 ++++++++------------------------- 2 files changed, 18 insertions(+), 39 deletions(-) diff --git a/aiserver.py b/aiserver.py index 9aeedfbd..b3604785 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2465,6 +2465,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features: use_8_bit = False + if use_8_bit: + koboldai_vars.lazy_load = False + koboldai_vars.breakmodel = False + logger.info("koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load)) if(initial_load): use_breakmodel_args = True if not utils.HAS_ACCELERATE: @@ -2676,6 +2680,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # Lazy loader import torch_lazy_loader def get_lazy_load_callback(n_layers, convert_to_float16=True): + logger.info("In Callback - koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load)) if not koboldai_vars.lazy_load: return @@ -2737,8 +2742,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal koboldai_vars.loaded_layers = 0 utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=Send_to_socketio()) - if koboldai_vars.bit_8_available: - import bitsandbytes as bnb with zipfile.ZipFile(f, "r") as z: try: last_storage_key = None @@ -2766,11 +2769,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal model_dict[key] = model_dict[key].materialize(f, map_location="cpu") if model_dict[key].dtype is torch.float32: koboldai_vars.fp32_model = True - if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and any(model_dict[key].dtype is t for t in (torch.float32, 
torch.float16)): - if use_8_bit: - model_dict[key] = bnb.nn.Int8Params(model_dict[key].to(torch.float16), requires_grad=False, has_fp16_weights=False).to(device if device not in ("shared", "disk") else "cpu") - else: - model_dict[key] = model_dict[key].to(torch.float16) + if convert_to_float16 and breakmodel.primary_device != "cpu" and koboldai_vars.hascuda and (koboldai_vars.breakmodel or koboldai_vars.usegpu) and model_dict[key].dtype is torch.float32: + model_dict[key] = model_dict[key].to(torch.float16) if breakmodel.primary_device == "cpu" or (not koboldai_vars.usegpu and not koboldai_vars.breakmodel and model_dict[key].dtype is torch.float16): model_dict[key] = model_dict[key].to(torch.float32) if device == "shared": @@ -2882,7 +2882,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal import shutil shutil.move(koboldai_vars.model.replace('/', '_'), "models/{}".format(koboldai_vars.model.replace('/', '_'))) if(koboldai_vars.lazy_load): # If we're using lazy loader, we need to figure out what the model's hidden layers are called - with torch_lazy_loader.use_lazy_torch_load(bit_8_available=koboldai_vars.bit_8_available, dematerialized_modules=True, use_accelerate_init_empty_weights=True): + with torch_lazy_loader.use_lazy_torch_load(dematerialized_modules=True, use_accelerate_init_empty_weights=True): try: metamodel = AutoModelForCausalLM.from_config(model_config) except Exception as e: @@ -2890,7 +2890,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal utils.layers_module_names = utils.get_layers_module_names(metamodel) utils.module_names = list(metamodel.state_dict().keys()) utils.named_buffers = list(metamodel.named_buffers(recurse=True)) - with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(bit_8_available=koboldai_vars.bit_8_available, enable=koboldai_vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if koboldai_vars.lazy_load else None, dematerialized_modules=True): + with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(enable=koboldai_vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if koboldai_vars.lazy_load else None, dematerialized_modules=True): if(koboldai_vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(koboldai_vars.custmodpth)): @@ -2905,12 +2905,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - logger.info("Using 8 bit: {}".format(use_8_bit)) model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - logger.info("Automodel failed, falling back to GPTNeo") model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: @@ -2929,7 +2927,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise 
RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - logger.info("Automodel failed, falling back to GPTNeo") model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) else: try: @@ -2965,7 +2962,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - logger.info("Automodel failed, falling back to GPTNeo") model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) torch._utils._rebuild_tensor = old_rebuild_tensor @@ -3006,7 +3002,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(koboldai_vars.hascuda): if(koboldai_vars.usegpu): koboldai_vars.modeldim = get_hidden_size_from_model(model) - #model = model.half().to(koboldai_vars.gpu_device) + model = model.half().to(koboldai_vars.gpu_device) generator = model.generate elif(koboldai_vars.breakmodel): # Use both RAM and VRAM (breakmodel) koboldai_vars.modeldim = get_hidden_size_from_model(model) diff --git a/torch_lazy_loader.py b/torch_lazy_loader.py index ec99df11..1298335d 100644 --- a/torch_lazy_loader.py +++ b/torch_lazy_loader.py @@ -225,11 +225,7 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, miss try: with torch.no_grad(): #param.copy_(input_param) - import bitsandbytes as bnb # This line is new - if isinstance(input_param, bnb.nn.Int8Params): # This line is new - new_param = input_param # This line is new - else: # This line is new - new_param = torch.nn.Parameter(input_param, requires_grad=param.requires_grad) # This line is new + new_param = torch.nn.Parameter(input_param, requires_grad=param.requires_grad) # This line is new if name in self._parameters: # This line is new self._parameters[name] = new_param # This line is new if name in persistent_buffers: # This line is new @@ -281,7 +277,7 @@ def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler pickle.load = old_pickle_load @contextlib.contextmanager -def use_lazy_torch_load(bit_8_available=False, enable=True, callback: Optional[Callable] = None, dematerialized_modules=False, use_accelerate_init_empty_weights=False): +def use_lazy_torch_load(enable=True, callback: Optional[Callable] = None, dematerialized_modules=False, use_accelerate_init_empty_weights=False): if not enable: with use_custom_unpickler(RestrictedUnpickler): yield False @@ -302,31 +298,17 @@ def use_lazy_torch_load(bit_8_available=False, enable=True, callback: Optional[C torch.load = torch_load if dematerialized_modules: - old_linear_init = torch.nn.Linear.__init__ - old_embedding_init = torch.nn.Embedding.__init__ - old_layernorm_init = torch.nn.LayerNorm.__init__ - old_load_from_state_dict = torch.nn.Module._load_from_state_dict if use_accelerate_init_empty_weights and utils.HAS_ACCELERATE: import accelerate init_empty_weights = accelerate.init_empty_weights() init_empty_weights.__enter__() else: - import accelerate - if bit_8_available: - import bitsandbytes as bnb - + old_linear_init = torch.nn.Linear.__init__ + old_embedding_init = torch.nn.Embedding.__init__ + old_layernorm_init = torch.nn.LayerNorm.__init__ + def 
linear_init(self, *args, device=None, **kwargs): - if linear_init.nested_flag or not linear_init.bit_8_available: - return old_linear_init(self, *args, device=device, **kwargs) - linear_init.nested_flag = True - try: - self.__class__ = bnb.nn.Linear8bitLt - with accelerate.init_empty_weights(): - return bnb.nn.Linear8bitLt.__init__(self, *args, has_fp16_weights=False, threshold=6.0, **kwargs) - finally: - linear_init.nested_flag = False - linear_init.nested_flag = False - linear_init.bit_8_available = bit_8_available + return old_linear_init(self, *args, device="meta", **kwargs) def embedding_init(self, *args, device=None, **kwargs): return old_embedding_init(self, *args, device="meta", **kwargs) @@ -337,6 +319,7 @@ def use_lazy_torch_load(bit_8_available=False, enable=True, callback: Optional[C torch.nn.Linear.__init__ = linear_init torch.nn.Embedding.__init__ = embedding_init torch.nn.LayerNorm.__init__ = layernorm_init + old_load_from_state_dict = torch.nn.Module._load_from_state_dict torch.nn.Module._load_from_state_dict = _load_from_state_dict with use_custom_unpickler(_LazyUnpickler): From b28d8e76dc65c3158efe584be1b850af4cc37b74 Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 1 Dec 2022 18:02:05 -0500 Subject: [PATCH 25/26] Reverting farther back --- aiserver.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/aiserver.py b/aiserver.py index b3604785..4a302a13 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2461,16 +2461,13 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal global tokenizer koboldai_vars.aibusy = True koboldai_vars.horde_share = False - reset_model_settings() - if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features: use_8_bit = False if use_8_bit: koboldai_vars.lazy_load = False - koboldai_vars.breakmodel = False - logger.info("koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load)) if(initial_load): use_breakmodel_args = True + reset_model_settings() if not utils.HAS_ACCELERATE: disk_layers = None koboldai_vars.reset_model() @@ -2632,7 +2629,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal elif koboldai_vars.hascuda: if(koboldai_vars.bmsupported): koboldai_vars.usegpu = False - koboldai_vars.breakmodel = True if not use_8_bit else False + koboldai_vars.breakmodel = True else: koboldai_vars.breakmodel = False koboldai_vars.usegpu = use_gpu @@ -2680,7 +2677,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # Lazy loader import torch_lazy_loader def get_lazy_load_callback(n_layers, convert_to_float16=True): - logger.info("In Callback - koboldai_vars.lazy_load: {}".format(koboldai_vars.lazy_load)) if not koboldai_vars.lazy_load: return @@ -2922,17 +2918,12 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - logger.info("Using 8 bit: {}".format(use_8_bit)) model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), 
revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) else: - try: - torch._utils._rebuild_tensor = old_rebuild_tensor - except: - pass old_rebuild_tensor = torch._utils._rebuild_tensor def new_rebuild_tensor(storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, stride): if(not isinstance(storage, torch_lazy_loader.LazyTensor)): @@ -2957,7 +2948,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - logger.info("Using 8 bit: {}".format(use_8_bit)) model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) except Exception as e: if("out of memory" in traceback.format_exc().lower()): @@ -2966,7 +2956,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal torch._utils._rebuild_tensor = old_rebuild_tensor - if (not args.colab or args.savemodel) and not use_8_bit: + if not args.colab or args.savemodel: import shutil tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) if koboldai_vars.fp32_model: # Use save_pretrained to convert fp32 models to fp16 From 457b7a46c4d1652338f2a05b711fb789c6a1ca32 Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 1 Dec 2022 18:10:57 -0500 Subject: [PATCH 26/26] More undo of 8 bit --- aiserver.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/aiserver.py b/aiserver.py index 4a302a13..250a0866 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2461,10 +2461,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal global tokenizer koboldai_vars.aibusy = True koboldai_vars.horde_share = False - if not koboldai_vars.bit_8_available or not koboldai_vars.experimental_features: - use_8_bit = False - if use_8_bit: - koboldai_vars.lazy_load = False if(initial_load): use_breakmodel_args = True reset_model_settings() @@ -2901,11 +2897,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) + model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))): try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_fast=False) @@ -2918,11 +2914,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: 
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) + model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", **lowmem) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) + model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", **lowmem) else: old_rebuild_tensor = torch._utils._rebuild_tensor def new_rebuild_tensor(storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, stride): @@ -2948,18 +2944,18 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal except Exception as e: tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache") try: - model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) + model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") - model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", load_in_8bit=use_8_bit, device_map="auto", **lowmem) + model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", **lowmem) torch._utils._rebuild_tensor = old_rebuild_tensor if not args.colab or args.savemodel: import shutil tokenizer.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_'))) - if koboldai_vars.fp32_model: # Use save_pretrained to convert fp32 models to fp16 + if(koboldai_vars.fp32_model): # Use save_pretrained to convert fp32 models to fp16 model = model.half() model.save_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), max_shard_size="500MiB") else: # For fp16 models, we can just copy the model files directly