From 83c74e12f8c5da6d6fb206ba882aad3a2eb3a359 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 15 Sep 2023 17:00:40 +0800 Subject: [PATCH 01/18] Updated Kobold Lite to v63. You may want to wait a day or two to confirm its stable before merging --- static/klite.html | 940 +++++++++++++++++++++++++++++++++------------- 1 file changed, 675 insertions(+), 265 deletions(-) diff --git a/static/klite.html b/static/klite.html index cf30ee07..cf97eca7 100644 --- a/static/klite.html +++ b/static/klite.html @@ -3,7 +3,7 @@ +
[The remaining hunks of this static/klite.html diff lost their HTML markup during text extraction; only orphaned "+"/"-" markers and on-screen strings survive, so the actual markup changes are not reconstructible here. Surviving settings-panel labels include: Quick Presets, Temperature, Auto-Adjust Limits, Repetition Penalty (with its "going over 1.2 breaks 6B models" tooltip), the Format selector (Story / Adventure / Chat / Instruct), Idle Responses, Autoscroll, Trim Sentences, "Unban Tokens (KAI)" → "Unban EOS Tokens", "Persist Session" → "Persist Autosave Session", "Export Settings" → "JSON Exports Settings", Show Rename Save File, Autoscroll Text, Inverted Colors, Min Height (px), Horizontally-centered text, and Margin (px).]
@@ -9776,7 +10186,7 @@ if ('serviceWorker' in navigator) { //for local mode, we do not load any PWA service worker. //this will prevent PWA functionality locally but will avoid the scary 404 errors - if(localmode) + if(localflag) { console.log("Try to register service worker..."); try { From fd87ebdb66751adf87df0b9e593f72e11c776930 Mon Sep 17 00:00:00 2001 From: Henk Date: Sat, 16 Sep 2023 06:25:12 +0200 Subject: [PATCH 02/18] Horde length passtrough --- koboldai_settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/koboldai_settings.py b/koboldai_settings.py index b7408b93..9d2f5f4d 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1378,6 +1378,8 @@ class system_settings(settings): bridge_data.horde_url = self._koboldai_var.horde_url bridge_data.api_key = self._koboldai_var.horde_api_key bridge_data.scribe_name = self._koboldai_var.horde_worker_name + bridge_data.max_length = self._koboldai_var.genamt + bridge_data.max_context_length = self._koboldai_var.max_length bridge_data.disable_terminal_ui = self._koboldai_var.host if bridge_data.worker_name == "My Awesome Instance": bridge_data.worker_name = f"KoboldAI UI Instance #{random.randint(-100000000, 100000000)}" From d36049f56fabb3c981755cf753affe64a488b8b7 Mon Sep 17 00:00:00 2001 From: Henk Date: Sat, 16 Sep 2023 06:30:35 +0200 Subject: [PATCH 03/18] Newer Horde Worker --- AI-Horde-Worker | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AI-Horde-Worker b/AI-Horde-Worker index 755696b9..594bed95 160000 --- a/AI-Horde-Worker +++ b/AI-Horde-Worker @@ -1 +1 @@ -Subproject commit 755696b9d4464e4167bfea5fd426686420015038 +Subproject commit 594bed958a2e3fadcabf999a023dfe36fb0ad7dd From 613b02e6f818dc4252884f8125c7197c28e3689b Mon Sep 17 00:00:00 2001 From: Henk Date: Sat, 16 Sep 2023 15:56:44 +0200 Subject: [PATCH 04/18] Worker main branch --- AI-Horde-Worker | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AI-Horde-Worker b/AI-Horde-Worker index 594bed95..3e357f4d 160000 --- a/AI-Horde-Worker +++ b/AI-Horde-Worker @@ -1 +1 @@ -Subproject commit 594bed958a2e3fadcabf999a023dfe36fb0ad7dd +Subproject commit 3e357f4d8b284a637564024802c22fc3b19a5ffc From 806fc4b8ad561bca0691ce6b7235dd9e83cc12d2 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Tue, 19 Sep 2023 17:09:51 +0300 Subject: [PATCH 05/18] GPTQ support for IPEX --- environments/ipex.yml | 19 +++++++++++--- modeling/ipex/attention.py | 54 +++++++++++++++++++++++++++----------- 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/environments/ipex.yml b/environments/ipex.yml index de2e3de8..73b6025d 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -24,18 +24,22 @@ dependencies: - psutil - pip: - -f https://developer.intel.com/ipex-whl-stable-xpu - - torch==2.0.1a0 - - intel_extension_for_pytorch==2.0.110+xpu + - torch==2.0.1a0; sys_platform == 'linux' + - torch==2.0.0a0; sys_platform == 'win32' + - intel_extension_for_pytorch==2.0.110+xpu; sys_platform == 'linux' + - intel_extension_for_pytorch==2.0.110+gitba7f6c1; sys_platform == 'win32' + - intel-extension-for-transformers - flask-cloudflared==0.0.10 - flask-ngrok - flask-cors - lupa==1.10 - transformers[sentencepiece]==4.33.1 - huggingface_hub==0.16.4 - - optimum[onnxruntime]==1.12.0 + - optimum[openvino,nncf,neural-compressor]==1.12.0 - safetensors==0.3.3 - - accelerate==0.20.3 + - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra + - flask-session - ansi2html - flask_compress - ijson @@ -43,7 +47,14 @@ dependencies: - pydub - diffusers - 
git+https://github.com/0cc4m/hf_bleeding_edge/ + - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - einops - peft==0.3.0 + - scipy + - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - windows-curses; sys_platform == 'win32' - pynvml \ No newline at end of file diff --git a/modeling/ipex/attention.py b/modeling/ipex/attention.py index d7335bfa..e38689f2 100644 --- a/modeling/ipex/attention.py +++ b/modeling/ipex/attention.py @@ -64,8 +64,14 @@ def torch_bmm(input, mat2, *, out=None): original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False): #ARC GPUs can't allocate more than 4GB to a single block, Slice it: - shape_one, batch_size_attention, query_tokens, shape_four = query.shape - block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 + if len(query.shape) == 3: + batch_size_attention, query_tokens, shape_four = query.shape + shape_one = 1 + no_shape_one = True + else: + shape_one, batch_size_attention, query_tokens, shape_four = query.shape + no_shape_one = False + block_multiply = 3.6 if query.dtype == torch.float32 else 1.8 block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB split_slice_size = batch_size_attention if block_size >= 4000: @@ -101,21 +107,39 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. 
for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name start_idx_2 = i2 * split_2_slice_size end_idx_2 = (i2 + 1) * split_2_slice_size - hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( - query[:, start_idx:end_idx, start_idx_2:end_idx_2], - key[:, start_idx:end_idx, start_idx_2:end_idx_2], - value[:, start_idx:end_idx, start_idx_2:end_idx_2], - attn_mask=attn_mask[:, start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, + if no_shape_one: + hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( + query[start_idx:end_idx, start_idx_2:end_idx_2], + key[start_idx:end_idx, start_idx_2:end_idx_2], + value[start_idx:end_idx, start_idx_2:end_idx_2], + attn_mask=attn_mask[start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( + query[:, start_idx:end_idx, start_idx_2:end_idx_2], + key[:, start_idx:end_idx, start_idx_2:end_idx_2], + value[:, start_idx:end_idx, start_idx_2:end_idx_2], + attn_mask=attn_mask[:, start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + if no_shape_one: + hidden_states[start_idx:end_idx] = original_scaled_dot_product_attention( + query[start_idx:end_idx], + key[start_idx:end_idx], + value[start_idx:end_idx], + attn_mask=attn_mask[start_idx:end_idx] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + hidden_states[:, start_idx:end_idx] = original_scaled_dot_product_attention( + query[:, start_idx:end_idx], + key[:, start_idx:end_idx], + value[:, start_idx:end_idx], + attn_mask=attn_mask[:, start_idx:end_idx] if attn_mask is not None else attn_mask, dropout_p=dropout_p, is_causal=is_causal ) - else: - hidden_states[:, start_idx:end_idx] = original_scaled_dot_product_attention( - query[:, start_idx:end_idx], - key[:, start_idx:end_idx], - value[:, start_idx:end_idx], - attn_mask=attn_mask[:, start_idx:end_idx] if attn_mask is not None else attn_mask, - dropout_p=dropout_p, is_causal=is_causal - ) else: return original_scaled_dot_product_attention( query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal From 55251c6b8ee69d4d5d18276543df591f92d3e8a9 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Wed, 20 Sep 2023 20:30:30 +0800 Subject: [PATCH 06/18] updated kobold lite to v66 --- static/klite.html | 354 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 272 insertions(+), 82 deletions(-) diff --git a/static/klite.html b/static/klite.html index cf97eca7..b30aeac4 100644 --- a/static/klite.html +++ b/static/klite.html @@ -3,7 +3,7 @@ +
From d8877b642de81284bcd7725e06547c66591f7ff4 Mon Sep 17 00:00:00 2001 From: Nick Perez Date: Sat, 23 Sep 2023 00:53:48 -0400 Subject: [PATCH 07/18] [gptq_hf_torch] Fix typo in model type check `model_tseype` -> `model_type` --- modeling/inference_models/gptq_hf_torch/class.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 3094dc33..62e89072 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -362,7 +362,7 @@ class model_backend(HFTorchInferenceModel): model = load_quant_offload_device_map(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "opt": model = load_quant_offload_device_map(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) - elif model_tseype == "mpt": + elif model_type == "mpt": model = load_quant_offload_device_map(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "gpt_bigcode": model = load_quant_offload_device_map(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias).half() From 90959c3dcf43cd1bd0b98b62e3ae9fa62bcd28f6 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 24 Sep 2023 23:01:35 +0800 Subject: [PATCH 08/18] updated lite to v70 --- static/klite.html | 432 ++++++++++++++++++++++++++++++---------------- 1 file changed, 285 insertions(+), 147 deletions(-) diff --git a/static/klite.html b/static/klite.html index b30aeac4..9a4bca92 100644 --- a/static/klite.html +++ b/static/klite.html @@ -3,7 +3,7 @@
[The remaining static/klite.html hunks of this patch are likewise stripped of their markup by extraction; only UI strings survive. They cover the chat-display settings: You/AI bubble colors, a Rounded Bubbles toggle, Min Height (px), Portrait Style, replacement of the single "Portrait Size" (W/H) fields with separate User Portrait and AI Portrait Size and A/R fields, Show Names (Chat Mode), a "Markdown" option relabeled "Style Text", per-role color pickers for text / "speech" / *action* (default, You, AI, and System), and Code blocks background/foreground colors.]

From 9e51a50bcd6c4e287d4cd27a9c3a8303f8416cf4 Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 01:08:53 +0200 Subject: [PATCH 09/18] Llama fixes for Mistral --- modeling/inference_models/hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 7e291b93..8cb52d69 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -232,7 +232,7 @@ class HFInferenceModel(InferenceModel): self.model_type = str(self.model_config.model_type) # These are model specific tokenizer overrides if a model has bad defaults - if self.model_type == "llama": + if self.model_type == "llama" or self.model_type == "mistral": # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer self.tokenizer.add_bos_token = False self.tokenizer.legacy = False From 6fdf83aad5f93559f8c332c7c2521e6058f09883 Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 01:48:38 +0200 Subject: [PATCH 10/18] Basic Exllama2 --- modeling/inference_models/exllamav2/class.py | 417 +++++++++++++++++++ 1 file changed, 417 insertions(+) create mode 100644 modeling/inference_models/exllamav2/class.py diff --git a/modeling/inference_models/exllamav2/class.py b/modeling/inference_models/exllamav2/class.py new file mode 100644 index 00000000..95795506 --- /dev/null +++ b/modeling/inference_models/exllamav2/class.py @@ -0,0 +1,417 @@ +from __future__ import annotations +try: + import time, json + import torch + import requests + import numpy as np + from typing import List, Optional, Union + import os + import glob + from pathlib import Path + import re + import warnings + import gc + + import utils + from logger import logger + + from modeling import warpers + from modeling.warpers import Warper + from modeling.stoppers import Stoppers + from modeling.post_token_hooks import PostTokenHooks + from modeling.inference_model import ( + GenerationResult, + GenerationSettings, + InferenceModel, + ModelCapabilities, + ) + + from modeling.tokenizer import GenericTokenizer + + + from exllamav2.model import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config + from transformers import LlamaTokenizer + from exllamav2.generator import ExLlamaV2StreamingGenerator + load_failed = False +except: + load_failed = True + +model_backend_type = "Exl2" +model_backend_name = "ExLlama V2" + +# When set to true, messages will appear in the console if samplers are not +# changing the scores. Keep in mind some samplers don't always change the +# scores for each token. 
+LOG_SAMPLER_NO_EFFECT = False + +class model_backend(InferenceModel): + def __init__(self) -> None: + super().__init__() + self.model_config = None + + self.model = None + self.tokenizer = None + self.cache = None + self.generator = None + + self.model_name = "" + self.path = None + + self.post_token_hooks = [ + PostTokenHooks.stream_tokens, + ] + + self.stopper_hooks = [ + Stoppers.core_stopper, + Stoppers.dynamic_wi_scanner, + Stoppers.singleline_stopper, + Stoppers.chat_mode_stopper, + Stoppers.stop_sequence_stopper, + ] + + self.capabilties = ModelCapabilities( + embedding_manipulation=False, + post_token_hooks=True, + stopper_hooks=True, + post_token_probs=False, + ) + self.disable = load_failed + + def is_valid(self, model_name, model_path, menu_path): + try: + self.model_config = self._load_config(model_name, model_path) + #TODO check if model is valid + return True + except: + return False + + def get_local_model_path(self): + return self.path or os.path.join("models", self.model_name.replace("/", "_")) + + def _load_config(self, model_name, model_path): + config = ExLlamaV2Config() + if model_path is not None and os.path.exists(model_path): + config.model_dir = model_path + elif os.path.exists("models/{}".format(model_name.replace('/', '_'))): + config.model_dir = "models/{}".format(model_name.replace('/', '_')) + config.prepare() + + return config + + def _load(self, save_model: bool, initial_load: bool) -> None: + self.model = self._get_model(self.get_local_model_path(), {}) + #TODO support GPU split + self.model.load(None) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) + + self.cache = ExLlamaV2Cache(self.model) + + self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer.tokenizer) + + def _post_load(self) -> None: + # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer + self.tokenizer.add_bos_token = False + + # HF transformers no longer supports decode_with_prefix_space + # We work around this by wrapping decode, encode, and __call__ + # with versions that work around the 'prefix space' misfeature + # of sentencepiece. + vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size)) + has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")} + + # Wrap 'decode' with a method that always returns text starting with a space + # when the head token starts with a space. This is what 'decode_with_prefix_space' + # used to do, and we implement it using the same technique (building a cache of + # tokens that should have a prefix space, and then prepending a space if the first + # token is in this set.) We also work around a bizarre behavior in which decoding + # a single token 13 behaves differently than decoding a squence containing only [13]. + original_decode = type(self.tokenizer.tokenizer).decode + def decode_wrapper(self, token_ids, *args, **kwargs): + first = None + # Note, the code below that wraps single-value token_ids in a list + # is to work around this wonky behavior: + # >>> t.decode(13) + # '<0x0A>' + # >>> t.decode([13]) + # '\n' + # Not doing this causes token streaming to receive <0x0A> characters + # instead of newlines. + if isinstance(token_ids, int): + first = token_ids + token_ids = [first] + elif hasattr(token_ids, 'dim'): # Check for e.g. torch.Tensor + # Tensors don't support the Python standard of 'empty is False' + # and the special case of dimension 0 tensors also needs to be + # handled separately. 
+ if token_ids.dim() == 0: + first = int(token_ids.item()) + token_ids = [first] + elif len(token_ids) > 0: + first = int(token_ids[0]) + elif token_ids is not None and len(token_ids) > 0: + first = token_ids[0] + result = original_decode(self, token_ids, *args, **kwargs) + if first is not None and first in has_prefix_space: + result = " " + result + return result + # GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it + object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer)) + + # Wrap encode and __call__ to work around the 'prefix space' misfeature also. + # The problem is that "Bob" at the start of text is encoded as if it is + # " Bob". This creates a problem because it means you can't split text, encode + # the pieces, concatenate the tokens, decode them, and get the original text back. + # The workaround is to prepend a known token that (1) starts with a space; and + # (2) is not the prefix of any other token. After searching through the vocab + # " ," (space comma) is the only token containing only printable ascii characters + # that fits this bill. By prepending ',' to the text, the original encode + # method always returns [1919, ...], where the tail of the sequence is the + # actual encoded result we want without the prefix space behavior. + original_encode = type(self.tokenizer.tokenizer).encode + def encode_wrapper(self, text, *args, **kwargs): + if type(text) is str: + text = ',' + text + result = original_encode(self, text, *args, **kwargs) + result = result[1:] + else: + result = original_encode(self, text, *args, **kwargs) + return result + object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer)) + + # Since 'encode' is documented as being deprecated, also override __call__. + # This doesn't appear to currently be used by KoboldAI, but doing so + # in case someone uses it in the future. 
+ original_call = type(self.tokenizer.tokenizer).__call__ + def call_wrapper(self, text, *args, **kwargs): + if type(text) is str: + text = ',' + text + result = original_call(self, text, *args, **kwargs) + result = result[1:] + else: + result = original_call(self, text, *args, **kwargs) + return result + object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer)) + + # Cache the newline token (for single line mode) + # Since there is only one Llama token containing newline, just encode \n + self.newline_tokens = self.tokenizer.encode("\n") + self.bracket_tokens = [i for i, tok in enumerate(vocab) if '[' in tok or ']' in tok] + self.tokenizer._koboldai_header = self.tokenizer.encode("") + + def unload(self): + #self.model_config = None # This breaks more than it fixes - Henk + + self.model = None + self.tokenizer = None + self.cache = None + self.generator = None + + self.model_name = "" + self.path = None + + with torch.no_grad(): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated") + for tensor in gc.get_objects(): + try: + if torch.is_tensor(tensor): + tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype)) + except: + pass + gc.collect() + try: + with torch.no_grad(): + torch.cuda.empty_cache() + except: + pass + + def _apply_warpers( + self, scores: torch.Tensor, input_ids: torch.Tensor + ) -> torch.Tensor: + warpers.update_settings() + + if LOG_SAMPLER_NO_EFFECT: + pre = torch.Tensor(scores) + + for sid in utils.koboldai_vars.sampler_order: + warper = Warper.from_id(sid) + + if not warper.value_is_valid(): + continue + + if warper == warpers.RepetitionPenalty: + # Rep pen needs more data than other samplers + scores = warper.torch(scores, input_ids=input_ids.cuda()) + else: + scores = warper.torch(scores) + + assert scores is not None, f"Scores are None; warper '{warper}' is to blame" + + if LOG_SAMPLER_NO_EFFECT: + if torch.equal(pre, scores): + logger.info(warper, "had no effect on the scores.") + pre = torch.Tensor(scores) + return scores + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + seed: Optional[int] = None, + **kwargs, + ) -> GenerationResult: + if seed: + torch.manual_seed(seed) + + bad_words_ids = [self.tokenizer.bos_token_id] + if utils.koboldai_vars.use_default_badwordsids: + bad_words_ids.append(self.tokenizer.eos_token_id) + bad_words_ids.extend(self.bracket_tokens) + if single_line: + bad_words_ids.extend(self.newline_tokens) + + if not isinstance(prompt_tokens, torch.Tensor): + gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] + else: + gen_in = prompt_tokens + + self.generator._gen_begin_reuse(gen_in, None) + + for i in range(max_new): + logits = self.model.forward(self.generator.sequence_ids[:, -1:], self.generator.cache) + for bad_word_id in bad_words_ids: + logits[:, :, bad_word_id] = -10000.0 + + logits = torch.unsqueeze(logits[0, -1, :], 0) + + scores = self._apply_warpers(logits, gen_in) + + scores = torch.softmax(scores, dim=-1) + + # Work around a bug in torch.multinomial (https://github.com/pytorch/pytorch/issues/48841) + # With low probability, multinomial can return an element with zero weight. Since this + # happens infrequently, just sample repeatedly until all tokens have non-zero probability. 
+ for _ in range(100): + token = torch.multinomial(scores, 1) + # Verify that all selected tokens correspond to positive probabilities. + if (scores.gather(1, token) > 0).all(): + break + + if (token == self.tokenizer.eos_token_id).any(): + break + + if self.generator.sequence_ids is None: + self.generator.sequence_ids = token + else: + self.generator.sequence_ids = torch.cat([self.generator.sequence_ids, token.cpu()], dim=1) + + self._post_token_gen(self.generator.sequence_ids) + + utils.koboldai_vars.generated_tkns += 1 + + # Apply stoppers + do_stop = False + for stopper in self.stopper_hooks: + do_stop = stopper(self, self.generator.sequence_ids) + if do_stop: + break + if do_stop: + break + + seq = self.generator.sequence_ids[:, gen_in.size(1):] + + return GenerationResult( + model=self, + out_batches=np.array(seq,), + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) + + def _get_model(self, location: str, tf_kwargs: Dict): + if not self.model_config: + self.model_config = ExLlamaV2Config() + self.model_config.model_dir = location + self.model_config.prepare() + + # self.model_config.gpu_peer_fix = True + return ExLlamaV2(self.model_config) + + def _get_tokenizer(self, location: str): + tokenizer = GenericTokenizer(LlamaTokenizer.from_pretrained(location)) + return tokenizer + + def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}): + requested_parameters = [] + gpu_count = torch.cuda.device_count() + layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None + + requested_parameters.append({ + "uitype": "slider", + "unit": "int", + "label": "Maximum Context", + "id": "max_ctx", + "min": 2048, + "max": 16384, + "step": 512, + "default": 2048, + "tooltip": "The maximum context size the model supports", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + requested_parameters.append({ + "uitype": "slider", + "unit": "float", + "label": "Embedding Compression", + "id": "compress_emb", + "min": 1, + "max": 8, + "step": 0.25, + "default": 1, + "tooltip": "If the model requires compressed embeddings, set them here", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + requested_parameters.append({ + "uitype": "slider", + "unit": "float", + "label": "NTK alpha", + "id": "ntk_alpha", + "min": 1, + "max": 32, + "step": 0.25, + "default": 1, + "tooltip": "NTK alpha value", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + return requested_parameters + + def set_input_parameters(self, parameters): + gpu_count = torch.cuda.device_count() + + self.model_config.max_seq_len = parameters["max_ctx"] + self.model_config.compress_pos_emb = parameters["compress_emb"] + self.model_config.alpha_value = parameters["ntk_alpha"] + + # Disable half2 for HIP + self.model_config.rmsnorm_no_half2 = bool(torch.version.hip) + self.model_config.rope_no_half2 = bool(torch.version.hip) + self.model_config.matmul_no_half2 = bool(torch.version.hip) + self.model_config.silu_no_half2 = bool(torch.version.hip) + + # Disable scaled_dot_product_attention if torch version < 2 + if torch.__version__.startswith("1."): + self.model_config.sdp_thd = 0 + + 
self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] + self.path = parameters['path'] if 'path' in parameters else None \ No newline at end of file From e99789a51b01887e2ed5ac300afc33f9b38de7f2 Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 03:19:03 +0200 Subject: [PATCH 11/18] Flash Attention (Linux) --- environments/huggingface.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 7f834906..682e8051 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -1,7 +1,7 @@ name: koboldai channels: - pytorch - - nvidia + - nvidia/label/cuda-11.8.0 - conda-forge - defaults dependencies: @@ -13,6 +13,7 @@ dependencies: - pytorch=2.0.* - python=3.8.* - pytorch-cuda=11.8 + - cuda-nvcc=11.8 - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -59,3 +60,4 @@ dependencies: - windows-curses; sys_platform == 'win32' - pynvml - xformers==0.0.21 + - https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu118torch2.0cxx11abiTRUE-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' From 04cc322d7cdc066f12287e2faa550b9d54b7cc9a Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 03:24:01 +0200 Subject: [PATCH 12/18] New dependencies --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index 65e68b07..c76044ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -47,3 +47,6 @@ https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8' windows-curses; sys_platform == 'win32' pynvml +flash_attn==2.3.0 +xformers==0.0.21 +exllamav2==0.0.4 \ No newline at end of file From e238a1c9f6775756e7eec5d4bc0a057364a806ab Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 09:09:04 +0200 Subject: [PATCH 13/18] Ship exllamav2 --- environments/huggingface.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 682e8051..265e813f 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -14,6 +14,7 @@ dependencies: - python=3.8.* - pytorch-cuda=11.8 - cuda-nvcc=11.8 + - cuda-libraries-dev=11.8 - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -57,6 +58,8 @@ dependencies: - scipy - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/henk717/exllamav2/releases/download/0.4/exllamav2-0.0.4-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/henk717/exllamav2/releases/download/0.4/exllamav2-0.0.4-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - windows-curses; sys_platform == 'win32' - pynvml - xformers==0.0.21 From fa8d9e65ffea258e5723f098023826d7f481f5bb Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 09:51:09 +0200 Subject: [PATCH 14/18] Fix flash-attn --- environments/huggingface.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 265e813f..3d031194 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -63,4 +63,4 @@ dependencies: 
- windows-curses; sys_platform == 'win32' - pynvml - xformers==0.0.21 - - https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu118torch2.0cxx11abiTRUE-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu118torch2.0cxx11abiFALSE-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' From b141ee015566ad1206e93d995acba9dbc213ad64 Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 18:01:34 +0200 Subject: [PATCH 15/18] Universal downloader for more backends --- modeling/inference_models/basic_hf/class.py | 7 +++++++ modeling/inference_models/exllama/class.py | 6 ++++++ modeling/inference_models/exllamav2/class.py | 7 ++++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/modeling/inference_models/basic_hf/class.py b/modeling/inference_models/basic_hf/class.py index afca13ee..5ae2aa0d 100644 --- a/modeling/inference_models/basic_hf/class.py +++ b/modeling/inference_models/basic_hf/class.py @@ -148,6 +148,13 @@ class model_backend(InferenceModel): self.get_local_model_path(ignore_existance=True), ) + if not self.get_local_model_path(): + print(self.get_local_model_path()) + from huggingface_hub import snapshot_download + target_dir = "models/" + self.model_name.replace("/", "_") + print(self.model_name) + snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision) + self.init_model_config() self.model = AutoModelForCausalLM.from_pretrained( diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 4539b7a3..f688d611 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -128,6 +128,12 @@ class model_backend(InferenceModel): return config def _load(self, save_model: bool, initial_load: bool) -> None: + if not self.get_local_model_path(): + from huggingface_hub import snapshot_download + target_dir = "models/" + self.model_name.replace("/", "_") + print(self.model_name) + snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision) + self.model = self._get_model(self.get_local_model_path(), {}) self.tokenizer = self._get_tokenizer(self.get_local_model_path()) diff --git a/modeling/inference_models/exllamav2/class.py b/modeling/inference_models/exllamav2/class.py index 95795506..15b91c8d 100644 --- a/modeling/inference_models/exllamav2/class.py +++ b/modeling/inference_models/exllamav2/class.py @@ -36,7 +36,7 @@ try: except: load_failed = True -model_backend_type = "Exl2" +model_backend_type = "GPTQ" model_backend_name = "ExLlama V2" # When set to true, messages will appear in the console if samplers are not @@ -99,6 +99,11 @@ class model_backend(InferenceModel): return config def _load(self, save_model: bool, initial_load: bool) -> None: + if not self.get_local_model_path(): + from huggingface_hub import snapshot_download + target_dir = "models/" + self.model_name.replace("/", "_") + print(self.model_name) + snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision) self.model = self._get_model(self.get_local_model_path(), {}) #TODO support GPU split self.model.load(None) From 02d3b00ff120d7dc8b1bd61e2e3e629c78c69184 Mon Sep 17 00:00:00 2001 From: Henk Date: Wed, 4 Oct 2023 00:18:21 +0200 Subject: 
[PATCH 16/18] HF 4.34 --- environments/huggingface.yml | 8 ++++---- environments/ipex.yml | 6 +++--- environments/rocm.yml | 7 ++++--- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 3d031194..8bba67be 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -33,9 +33,9 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers[sentencepiece]==4.33.1 + - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - - optimum[onnxruntime]==1.12.0 + - optimum[onnxruntime]==1.13.2 - safetensors==0.3.3 - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra @@ -51,8 +51,8 @@ dependencies: - git+https://github.com/0cc4m/hf_bleeding_edge/ - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - einops - peft==0.3.0 - scipy diff --git a/environments/ipex.yml b/environments/ipex.yml index 73b6025d..2dd0bb3e 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -33,7 +33,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers[sentencepiece]==4.33.1 + - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - optimum[openvino,nncf,neural-compressor]==1.12.0 - safetensors==0.3.3 @@ -49,8 +49,8 @@ dependencies: - git+https://github.com/0cc4m/hf_bleeding_edge/ - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - einops - peft==0.3.0 - scipy diff --git a/environments/rocm.yml b/environments/rocm.yml index 5b56bef7..814a682f 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -29,9 +29,9 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers[sentencepiece]==4.33.1 + - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - - optimum[onnxruntime]==1.12.0 + - optimum[onnxruntime]==1.13.2 - safetensors==0.3.3 - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra @@ -45,4 +45,5 @@ dependencies: - einops - peft==0.3.0 - windows-curses; sys_platform == 'win32' - - 
pynvml \ No newline at end of file + - pynvml + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+rocm5.4.2-cp38-cp38-linux_x86_64.whl \ No newline at end of file From caa040b416c3d3467d99de2a055e707e4af36398 Mon Sep 17 00:00:00 2001 From: Henk Date: Wed, 4 Oct 2023 00:21:32 +0200 Subject: [PATCH 17/18] HF 4.34 part 2 --- requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index c76044ed..5a19a292 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -transformers[sentencepiece]==4.33.1 +transformers[sentencepiece]==4.34.0 huggingface_hub==0.16.4 -optimum[onnxruntime]==1.12.0 +optimum[onnxruntime]==1.13.2 safetensors==0.3.3 Flask==2.3.3 Flask-SocketIO==5.3.2 @@ -41,10 +41,10 @@ git+https://github.com/0cc4m/hf_bleeding_edge/ einops peft==0.3.0 scipy -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp310-cp310-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.10' -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp310-cp310-win_amd64.whl; sys_platform == 'win32' and python_version == '3.10' -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.8' -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp310-cp310-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.10' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp310-cp310-win_amd64.whl; sys_platform == 'win32' and python_version == '3.10' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.8' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8' windows-curses; sys_platform == 'win32' pynvml flash_attn==2.3.0 From 5a3986fb06266b4a84ac7a35e30782c709da6bab Mon Sep 17 00:00:00 2001 From: Disty0 Date: Wed, 4 Oct 2023 01:32:36 +0300 Subject: [PATCH 18/18] IPEX Optimizations --- environments/ipex.yml | 4 ++-- modeling/ipex/__init__.py | 9 +++++++-- modeling/ipex/attention.py | 33 +++++++++++++++++++-------------- modeling/ipex/diffusers.py | 9 +++++---- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/environments/ipex.yml b/environments/ipex.yml index 2dd0bb3e..88c1d140 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -23,7 +23,7 @@ dependencies: - Pillow - psutil - pip: - - -f https://developer.intel.com/ipex-whl-stable-xpu + - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - torch==2.0.1a0; sys_platform == 'linux' - torch==2.0.0a0; sys_platform == 'win32' - intel_extension_for_pytorch==2.0.110+xpu; sys_platform == 'linux' @@ -35,7 +35,7 @@ dependencies: - lupa==1.10 - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - - optimum[openvino,nncf,neural-compressor]==1.12.0 + - optimum[onnxruntime,openvino,nncf,neural-compressor]==1.13.2 - safetensors==0.3.3 - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra diff --git a/modeling/ipex/__init__.py b/modeling/ipex/__init__.py index 
9ec69012..43accd9f 100644 --- a/modeling/ipex/__init__.py +++ b/modeling/ipex/__init__.py @@ -16,7 +16,6 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.device = torch.xpu.device torch.cuda.device_count = torch.xpu.device_count torch.cuda.device_of = torch.xpu.device_of - torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard torch.cuda.get_device_name = torch.xpu.get_device_name torch.cuda.get_device_properties = torch.xpu.get_device_properties torch.cuda.init = torch.xpu.init @@ -145,7 +144,7 @@ def ipex_init(): # pylint: disable=too-many-statements ipex._C._DeviceProperties.minor = 2 #Fix functions with ipex: - torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_allocated(device)), torch.xpu.get_device_properties(device).total_memory] + torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory] torch._utils._get_available_device_type = lambda: "xpu" torch.has_cuda = True torch.cuda.has_half = True @@ -157,6 +156,12 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.get_device_properties.minor = 7 torch.cuda.ipc_collect = lambda *args, **kwargs: None torch.cuda.utilization = lambda *args, **kwargs: 0 + if hasattr(torch.xpu, 'getDeviceIdListForCard'): + torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard + torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard + else: + torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card + torch.cuda.get_device_id_list_per_card = torch.xpu.get_device_id_list_per_card ipex_hijacks() attention_init() diff --git a/modeling/ipex/attention.py b/modeling/ipex/attention.py index e38689f2..84848b6a 100644 --- a/modeling/ipex/attention.py +++ b/modeling/ipex/attention.py @@ -10,13 +10,15 @@ def torch_bmm(input, mat2, *, out=None): #ARC GPUs can't allocate more than 4GB to a single block, Slice it: batch_size_attention, input_tokens, mat2_shape = input.shape[0], input.shape[1], mat2.shape[2] - block_multiply = 2.4 if input.dtype == torch.float32 else 1.2 - block_size = (batch_size_attention * input_tokens * mat2_shape) / 1024 * block_multiply #MB + block_multiply = input.element_size() + slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply + block_size = batch_size_attention * slice_block_size + split_slice_size = batch_size_attention - if block_size >= 4000: + if block_size > 4: do_split = True #Find something divisible with the input_tokens - while ((split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply) > 4000: + while (split_slice_size * slice_block_size) > 4: split_slice_size = split_slice_size // 2 if split_slice_size <= 1: split_slice_size = 1 @@ -24,12 +26,12 @@ def torch_bmm(input, mat2, *, out=None): else: do_split = False - split_block_size = (split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply #MB split_2_slice_size = input_tokens - if split_block_size >= 4000: + if split_slice_size * slice_block_size > 4: + slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply do_split_2 = True #Find something divisible with the input_tokens - while ((split_slice_size * split_2_slice_size * mat2_shape) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size2) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 
1 @@ -71,13 +73,16 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. else: shape_one, batch_size_attention, query_tokens, shape_four = query.shape no_shape_one = False - block_multiply = 3.6 if query.dtype == torch.float32 else 1.8 - block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB + + block_multiply = query.element_size() + slice_block_size = shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply + block_size = batch_size_attention * slice_block_size + split_slice_size = batch_size_attention - if block_size >= 4000: + if block_size > 4: do_split = True #Find something divisible with the shape_one - while ((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000: + while (split_slice_size * slice_block_size) > 4: split_slice_size = split_slice_size // 2 if split_slice_size <= 1: split_slice_size = 1 @@ -85,12 +90,12 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. else: do_split = False - split_block_size = (shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply #MB split_2_slice_size = query_tokens - if split_block_size >= 4000: + if split_slice_size * slice_block_size > 4: + slice_block_size2 = shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply do_split_2 = True #Find something divisible with the batch_size_attention - while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size2) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 1 diff --git a/modeling/ipex/diffusers.py b/modeling/ipex/diffusers.py index 4c39896e..005ee49f 100644 --- a/modeling/ipex/diffusers.py +++ b/modeling/ipex/diffusers.py @@ -55,13 +55,14 @@ class SlicedAttnProcessor: # pylint: disable=too-few-public-methods ) #ARC GPUs can't allocate more than 4GB to a single block, Slice it: - block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 - block_size = (batch_size_attention * query_tokens * shape_three) / 1024 * block_multiply #MB + block_multiply = query.element_size() + slice_block_size = self.slice_size * shape_three / 1024 / 1024 * block_multiply + block_size = query_tokens * slice_block_size split_2_slice_size = query_tokens - if block_size >= 4000: + if block_size > 4: do_split_2 = True #Find something divisible with the query_tokens - while ((self.slice_size * split_2_slice_size * shape_three) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 1
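
The IPEX hunks in this last patch repeatedly apply one idea: pick a split size so that no single slice of a batched matmul or attention call exceeds Arc's per-allocation limit, then compute the result piecewise into a preallocated output. Below is a minimal sketch of that strategy, not the module's API: the helper names (pick_split_size, sliced_bmm) are invented, the budget constant simply mirrors the literal `> 4` comparison the hunks make on `element_size * elements / 1024 / 1024`, and the second-level split over the token dimension that the patch adds when a single batch slice is still too large is omitted for brevity.

# Minimal sketch (assumed names, not the patched module's API) of the slicing
# strategy used in modeling/ipex/attention.py above.
import torch

def pick_split_size(batch: int, per_item_cost: float, budget: float = 4.0) -> int:
    # Same shape as the patch's loop: keep halving until one slice fits (minimum 1).
    split = batch
    while split * per_item_cost > budget and split > 1:
        split //= 2
    return max(split, 1)

def sliced_bmm(a: torch.Tensor, b: torch.Tensor, budget: float = 4.0) -> torch.Tensor:
    # a: (B, N, K), b: (B, K, M) -> (B, N, M), computed slice-by-slice over B.
    batch, tokens, _ = a.shape
    out_cols = b.shape[2]
    # Cost of one batch element of the output, in the same units the patch
    # compares against 4 (element_size * elements / 1024 / 1024).
    per_item_cost = tokens * out_cols * a.element_size() / 1024 / 1024
    split = pick_split_size(batch, per_item_cost, budget)
    out = torch.empty((batch, tokens, out_cols), dtype=a.dtype, device=a.device)
    for start in range(0, batch, split):
        out[start:start + split] = torch.bmm(a[start:start + split], b[start:start + split])
    return out

The patch applies this same pattern to torch_bmm, scaled_dot_product_attention, and the diffusers SlicedAttnProcessor, switching from hard-coded dtype multipliers to element_size() so the cost estimate holds for any precision.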