From 9e51a50bcd6c4e287d4cd27a9c3a8303f8416cf4 Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 01:08:53 +0200 Subject: [PATCH 01/10] Llama fixes for Mistral --- modeling/inference_models/hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 7e291b93..8cb52d69 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -232,7 +232,7 @@ class HFInferenceModel(InferenceModel): self.model_type = str(self.model_config.model_type) # These are model specific tokenizer overrides if a model has bad defaults - if self.model_type == "llama": + if self.model_type == "llama" or self.model_type == "mistral": # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer self.tokenizer.add_bos_token = False self.tokenizer.legacy = False From 6fdf83aad5f93559f8c332c7c2521e6058f09883 Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 01:48:38 +0200 Subject: [PATCH 02/10] Basic Exllama2 --- modeling/inference_models/exllamav2/class.py | 417 +++++++++++++++++++ 1 file changed, 417 insertions(+) create mode 100644 modeling/inference_models/exllamav2/class.py diff --git a/modeling/inference_models/exllamav2/class.py b/modeling/inference_models/exllamav2/class.py new file mode 100644 index 00000000..95795506 --- /dev/null +++ b/modeling/inference_models/exllamav2/class.py @@ -0,0 +1,417 @@ +from __future__ import annotations +try: + import time, json + import torch + import requests + import numpy as np + from typing import List, Optional, Union + import os + import glob + from pathlib import Path + import re + import warnings + import gc + + import utils + from logger import logger + + from modeling import warpers + from modeling.warpers import Warper + from modeling.stoppers import Stoppers + from modeling.post_token_hooks import PostTokenHooks + from modeling.inference_model import ( + GenerationResult, + GenerationSettings, + InferenceModel, + ModelCapabilities, + ) + + from modeling.tokenizer import GenericTokenizer + + + from exllamav2.model import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config + from transformers import LlamaTokenizer + from exllamav2.generator import ExLlamaV2StreamingGenerator + load_failed = False +except: + load_failed = True + +model_backend_type = "Exl2" +model_backend_name = "ExLlama V2" + +# When set to true, messages will appear in the console if samplers are not +# changing the scores. Keep in mind some samplers don't always change the +# scores for each token. 
+LOG_SAMPLER_NO_EFFECT = False + +class model_backend(InferenceModel): + def __init__(self) -> None: + super().__init__() + self.model_config = None + + self.model = None + self.tokenizer = None + self.cache = None + self.generator = None + + self.model_name = "" + self.path = None + + self.post_token_hooks = [ + PostTokenHooks.stream_tokens, + ] + + self.stopper_hooks = [ + Stoppers.core_stopper, + Stoppers.dynamic_wi_scanner, + Stoppers.singleline_stopper, + Stoppers.chat_mode_stopper, + Stoppers.stop_sequence_stopper, + ] + + self.capabilties = ModelCapabilities( + embedding_manipulation=False, + post_token_hooks=True, + stopper_hooks=True, + post_token_probs=False, + ) + self.disable = load_failed + + def is_valid(self, model_name, model_path, menu_path): + try: + self.model_config = self._load_config(model_name, model_path) + #TODO check if model is valid + return True + except: + return False + + def get_local_model_path(self): + return self.path or os.path.join("models", self.model_name.replace("/", "_")) + + def _load_config(self, model_name, model_path): + config = ExLlamaV2Config() + if model_path is not None and os.path.exists(model_path): + config.model_dir = model_path + elif os.path.exists("models/{}".format(model_name.replace('/', '_'))): + config.model_dir = "models/{}".format(model_name.replace('/', '_')) + config.prepare() + + return config + + def _load(self, save_model: bool, initial_load: bool) -> None: + self.model = self._get_model(self.get_local_model_path(), {}) + #TODO support GPU split + self.model.load(None) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) + + self.cache = ExLlamaV2Cache(self.model) + + self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer.tokenizer) + + def _post_load(self) -> None: + # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer + self.tokenizer.add_bos_token = False + + # HF transformers no longer supports decode_with_prefix_space + # We work around this by wrapping decode, encode, and __call__ + # with versions that work around the 'prefix space' misfeature + # of sentencepiece. + vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size)) + has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")} + + # Wrap 'decode' with a method that always returns text starting with a space + # when the head token starts with a space. This is what 'decode_with_prefix_space' + # used to do, and we implement it using the same technique (building a cache of + # tokens that should have a prefix space, and then prepending a space if the first + # token is in this set.) We also work around a bizarre behavior in which decoding + # a single token 13 behaves differently than decoding a squence containing only [13]. + original_decode = type(self.tokenizer.tokenizer).decode + def decode_wrapper(self, token_ids, *args, **kwargs): + first = None + # Note, the code below that wraps single-value token_ids in a list + # is to work around this wonky behavior: + # >>> t.decode(13) + # '<0x0A>' + # >>> t.decode([13]) + # '\n' + # Not doing this causes token streaming to receive <0x0A> characters + # instead of newlines. + if isinstance(token_ids, int): + first = token_ids + token_ids = [first] + elif hasattr(token_ids, 'dim'): # Check for e.g. torch.Tensor + # Tensors don't support the Python standard of 'empty is False' + # and the special case of dimension 0 tensors also needs to be + # handled separately. 
+ if token_ids.dim() == 0: + first = int(token_ids.item()) + token_ids = [first] + elif len(token_ids) > 0: + first = int(token_ids[0]) + elif token_ids is not None and len(token_ids) > 0: + first = token_ids[0] + result = original_decode(self, token_ids, *args, **kwargs) + if first is not None and first in has_prefix_space: + result = " " + result + return result + # GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it + object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer)) + + # Wrap encode and __call__ to work around the 'prefix space' misfeature also. + # The problem is that "Bob" at the start of text is encoded as if it is + # " Bob". This creates a problem because it means you can't split text, encode + # the pieces, concatenate the tokens, decode them, and get the original text back. + # The workaround is to prepend a known token that (1) starts with a space; and + # (2) is not the prefix of any other token. After searching through the vocab + # " ," (space comma) is the only token containing only printable ascii characters + # that fits this bill. By prepending ',' to the text, the original encode + # method always returns [1919, ...], where the tail of the sequence is the + # actual encoded result we want without the prefix space behavior. + original_encode = type(self.tokenizer.tokenizer).encode + def encode_wrapper(self, text, *args, **kwargs): + if type(text) is str: + text = ',' + text + result = original_encode(self, text, *args, **kwargs) + result = result[1:] + else: + result = original_encode(self, text, *args, **kwargs) + return result + object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer)) + + # Since 'encode' is documented as being deprecated, also override __call__. + # This doesn't appear to currently be used by KoboldAI, but doing so + # in case someone uses it in the future. 
+ original_call = type(self.tokenizer.tokenizer).__call__ + def call_wrapper(self, text, *args, **kwargs): + if type(text) is str: + text = ',' + text + result = original_call(self, text, *args, **kwargs) + result = result[1:] + else: + result = original_call(self, text, *args, **kwargs) + return result + object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer)) + + # Cache the newline token (for single line mode) + # Since there is only one Llama token containing newline, just encode \n + self.newline_tokens = self.tokenizer.encode("\n") + self.bracket_tokens = [i for i, tok in enumerate(vocab) if '[' in tok or ']' in tok] + self.tokenizer._koboldai_header = self.tokenizer.encode("") + + def unload(self): + #self.model_config = None # This breaks more than it fixes - Henk + + self.model = None + self.tokenizer = None + self.cache = None + self.generator = None + + self.model_name = "" + self.path = None + + with torch.no_grad(): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated") + for tensor in gc.get_objects(): + try: + if torch.is_tensor(tensor): + tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype)) + except: + pass + gc.collect() + try: + with torch.no_grad(): + torch.cuda.empty_cache() + except: + pass + + def _apply_warpers( + self, scores: torch.Tensor, input_ids: torch.Tensor + ) -> torch.Tensor: + warpers.update_settings() + + if LOG_SAMPLER_NO_EFFECT: + pre = torch.Tensor(scores) + + for sid in utils.koboldai_vars.sampler_order: + warper = Warper.from_id(sid) + + if not warper.value_is_valid(): + continue + + if warper == warpers.RepetitionPenalty: + # Rep pen needs more data than other samplers + scores = warper.torch(scores, input_ids=input_ids.cuda()) + else: + scores = warper.torch(scores) + + assert scores is not None, f"Scores are None; warper '{warper}' is to blame" + + if LOG_SAMPLER_NO_EFFECT: + if torch.equal(pre, scores): + logger.info(warper, "had no effect on the scores.") + pre = torch.Tensor(scores) + return scores + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + seed: Optional[int] = None, + **kwargs, + ) -> GenerationResult: + if seed: + torch.manual_seed(seed) + + bad_words_ids = [self.tokenizer.bos_token_id] + if utils.koboldai_vars.use_default_badwordsids: + bad_words_ids.append(self.tokenizer.eos_token_id) + bad_words_ids.extend(self.bracket_tokens) + if single_line: + bad_words_ids.extend(self.newline_tokens) + + if not isinstance(prompt_tokens, torch.Tensor): + gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] + else: + gen_in = prompt_tokens + + self.generator._gen_begin_reuse(gen_in, None) + + for i in range(max_new): + logits = self.model.forward(self.generator.sequence_ids[:, -1:], self.generator.cache) + for bad_word_id in bad_words_ids: + logits[:, :, bad_word_id] = -10000.0 + + logits = torch.unsqueeze(logits[0, -1, :], 0) + + scores = self._apply_warpers(logits, gen_in) + + scores = torch.softmax(scores, dim=-1) + + # Work around a bug in torch.multinomial (https://github.com/pytorch/pytorch/issues/48841) + # With low probability, multinomial can return an element with zero weight. Since this + # happens infrequently, just sample repeatedly until all tokens have non-zero probability. 
+ for _ in range(100): + token = torch.multinomial(scores, 1) + # Verify that all selected tokens correspond to positive probabilities. + if (scores.gather(1, token) > 0).all(): + break + + if (token == self.tokenizer.eos_token_id).any(): + break + + if self.generator.sequence_ids is None: + self.generator.sequence_ids = token + else: + self.generator.sequence_ids = torch.cat([self.generator.sequence_ids, token.cpu()], dim=1) + + self._post_token_gen(self.generator.sequence_ids) + + utils.koboldai_vars.generated_tkns += 1 + + # Apply stoppers + do_stop = False + for stopper in self.stopper_hooks: + do_stop = stopper(self, self.generator.sequence_ids) + if do_stop: + break + if do_stop: + break + + seq = self.generator.sequence_ids[:, gen_in.size(1):] + + return GenerationResult( + model=self, + out_batches=np.array(seq,), + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) + + def _get_model(self, location: str, tf_kwargs: Dict): + if not self.model_config: + self.model_config = ExLlamaV2Config() + self.model_config.model_dir = location + self.model_config.prepare() + + # self.model_config.gpu_peer_fix = True + return ExLlamaV2(self.model_config) + + def _get_tokenizer(self, location: str): + tokenizer = GenericTokenizer(LlamaTokenizer.from_pretrained(location)) + return tokenizer + + def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}): + requested_parameters = [] + gpu_count = torch.cuda.device_count() + layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None + + requested_parameters.append({ + "uitype": "slider", + "unit": "int", + "label": "Maximum Context", + "id": "max_ctx", + "min": 2048, + "max": 16384, + "step": 512, + "default": 2048, + "tooltip": "The maximum context size the model supports", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + requested_parameters.append({ + "uitype": "slider", + "unit": "float", + "label": "Embedding Compression", + "id": "compress_emb", + "min": 1, + "max": 8, + "step": 0.25, + "default": 1, + "tooltip": "If the model requires compressed embeddings, set them here", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + requested_parameters.append({ + "uitype": "slider", + "unit": "float", + "label": "NTK alpha", + "id": "ntk_alpha", + "min": 1, + "max": 32, + "step": 0.25, + "default": 1, + "tooltip": "NTK alpha value", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + return requested_parameters + + def set_input_parameters(self, parameters): + gpu_count = torch.cuda.device_count() + + self.model_config.max_seq_len = parameters["max_ctx"] + self.model_config.compress_pos_emb = parameters["compress_emb"] + self.model_config.alpha_value = parameters["ntk_alpha"] + + # Disable half2 for HIP + self.model_config.rmsnorm_no_half2 = bool(torch.version.hip) + self.model_config.rope_no_half2 = bool(torch.version.hip) + self.model_config.matmul_no_half2 = bool(torch.version.hip) + self.model_config.silu_no_half2 = bool(torch.version.hip) + + # Disable scaled_dot_product_attention if torch version < 2 + if torch.__version__.startswith("1."): + self.model_config.sdp_thd = 0 + + 
self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] + self.path = parameters['path'] if 'path' in parameters else None \ No newline at end of file From e99789a51b01887e2ed5ac300afc33f9b38de7f2 Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 03:19:03 +0200 Subject: [PATCH 03/10] Flash Attention (Linux) --- environments/huggingface.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 7f834906..682e8051 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -1,7 +1,7 @@ name: koboldai channels: - pytorch - - nvidia + - nvidia/label/cuda-11.8.0 - conda-forge - defaults dependencies: @@ -13,6 +13,7 @@ dependencies: - pytorch=2.0.* - python=3.8.* - pytorch-cuda=11.8 + - cuda-nvcc=11.8 - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -59,3 +60,4 @@ dependencies: - windows-curses; sys_platform == 'win32' - pynvml - xformers==0.0.21 + - https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu118torch2.0cxx11abiTRUE-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' From 04cc322d7cdc066f12287e2faa550b9d54b7cc9a Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 03:24:01 +0200 Subject: [PATCH 04/10] New dependencies --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index 65e68b07..c76044ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -47,3 +47,6 @@ https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8' windows-curses; sys_platform == 'win32' pynvml +flash_attn==2.3.0 +xformers==0.0.21 +exllamav2==0.0.4 \ No newline at end of file From e238a1c9f6775756e7eec5d4bc0a057364a806ab Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 09:09:04 +0200 Subject: [PATCH 05/10] Ship exllamav2 --- environments/huggingface.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 682e8051..265e813f 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -14,6 +14,7 @@ dependencies: - python=3.8.* - pytorch-cuda=11.8 - cuda-nvcc=11.8 + - cuda-libraries-dev=11.8 - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -57,6 +58,8 @@ dependencies: - scipy - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/henk717/exllamav2/releases/download/0.4/exllamav2-0.0.4-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/henk717/exllamav2/releases/download/0.4/exllamav2-0.0.4-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - windows-curses; sys_platform == 'win32' - pynvml - xformers==0.0.21 From fa8d9e65ffea258e5723f098023826d7f481f5bb Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 09:51:09 +0200 Subject: [PATCH 06/10] Fix flash-attn --- environments/huggingface.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 265e813f..3d031194 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -63,4 +63,4 @@ dependencies: 
- windows-curses; sys_platform == 'win32' - pynvml - xformers==0.0.21 - - https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu118torch2.0cxx11abiTRUE-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu118torch2.0cxx11abiFALSE-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' From b141ee015566ad1206e93d995acba9dbc213ad64 Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 28 Sep 2023 18:01:34 +0200 Subject: [PATCH 07/10] Universal downloader for more backends --- modeling/inference_models/basic_hf/class.py | 7 +++++++ modeling/inference_models/exllama/class.py | 6 ++++++ modeling/inference_models/exllamav2/class.py | 7 ++++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/modeling/inference_models/basic_hf/class.py b/modeling/inference_models/basic_hf/class.py index afca13ee..5ae2aa0d 100644 --- a/modeling/inference_models/basic_hf/class.py +++ b/modeling/inference_models/basic_hf/class.py @@ -148,6 +148,13 @@ class model_backend(InferenceModel): self.get_local_model_path(ignore_existance=True), ) + if not self.get_local_model_path(): + print(self.get_local_model_path()) + from huggingface_hub import snapshot_download + target_dir = "models/" + self.model_name.replace("/", "_") + print(self.model_name) + snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision) + self.init_model_config() self.model = AutoModelForCausalLM.from_pretrained( diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 4539b7a3..f688d611 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -128,6 +128,12 @@ class model_backend(InferenceModel): return config def _load(self, save_model: bool, initial_load: bool) -> None: + if not self.get_local_model_path(): + from huggingface_hub import snapshot_download + target_dir = "models/" + self.model_name.replace("/", "_") + print(self.model_name) + snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision) + self.model = self._get_model(self.get_local_model_path(), {}) self.tokenizer = self._get_tokenizer(self.get_local_model_path()) diff --git a/modeling/inference_models/exllamav2/class.py b/modeling/inference_models/exllamav2/class.py index 95795506..15b91c8d 100644 --- a/modeling/inference_models/exllamav2/class.py +++ b/modeling/inference_models/exllamav2/class.py @@ -36,7 +36,7 @@ try: except: load_failed = True -model_backend_type = "Exl2" +model_backend_type = "GPTQ" model_backend_name = "ExLlama V2" # When set to true, messages will appear in the console if samplers are not @@ -99,6 +99,11 @@ class model_backend(InferenceModel): return config def _load(self, save_model: bool, initial_load: bool) -> None: + if not self.get_local_model_path(): + from huggingface_hub import snapshot_download + target_dir = "models/" + self.model_name.replace("/", "_") + print(self.model_name) + snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision) self.model = self._get_model(self.get_local_model_path(), {}) #TODO support GPU split self.model.load(None) From 02d3b00ff120d7dc8b1bd61e2e3e629c78c69184 Mon Sep 17 00:00:00 2001 From: Henk Date: Wed, 4 Oct 2023 00:18:21 +0200 Subject: 
[PATCH 08/10] HF 4.34 --- environments/huggingface.yml | 8 ++++---- environments/ipex.yml | 6 +++--- environments/rocm.yml | 7 ++++--- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 3d031194..8bba67be 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -33,9 +33,9 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers[sentencepiece]==4.33.1 + - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - - optimum[onnxruntime]==1.12.0 + - optimum[onnxruntime]==1.13.2 - safetensors==0.3.3 - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra @@ -51,8 +51,8 @@ dependencies: - git+https://github.com/0cc4m/hf_bleeding_edge/ - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - einops - peft==0.3.0 - scipy diff --git a/environments/ipex.yml b/environments/ipex.yml index 73b6025d..2dd0bb3e 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -33,7 +33,7 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers[sentencepiece]==4.33.1 + - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - optimum[openvino,nncf,neural-compressor]==1.12.0 - safetensors==0.3.3 @@ -49,8 +49,8 @@ dependencies: - git+https://github.com/0cc4m/hf_bleeding_edge/ - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - einops - peft==0.3.0 - scipy diff --git a/environments/rocm.yml b/environments/rocm.yml index 5b56bef7..814a682f 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -29,9 +29,9 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers[sentencepiece]==4.33.1 + - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - - optimum[onnxruntime]==1.12.0 + - optimum[onnxruntime]==1.13.2 - safetensors==0.3.3 - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra @@ -45,4 +45,5 @@ dependencies: - einops - peft==0.3.0 - windows-curses; sys_platform == 'win32' - - 
pynvml \ No newline at end of file + - pynvml + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+rocm5.4.2-cp38-cp38-linux_x86_64.whl \ No newline at end of file From caa040b416c3d3467d99de2a055e707e4af36398 Mon Sep 17 00:00:00 2001 From: Henk Date: Wed, 4 Oct 2023 00:21:32 +0200 Subject: [PATCH 09/10] HF 4.34 part 2 --- requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index c76044ed..5a19a292 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -transformers[sentencepiece]==4.33.1 +transformers[sentencepiece]==4.34.0 huggingface_hub==0.16.4 -optimum[onnxruntime]==1.12.0 +optimum[onnxruntime]==1.13.2 safetensors==0.3.3 Flask==2.3.3 Flask-SocketIO==5.3.2 @@ -41,10 +41,10 @@ git+https://github.com/0cc4m/hf_bleeding_edge/ einops peft==0.3.0 scipy -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp310-cp310-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.10' -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp310-cp310-win_amd64.whl; sys_platform == 'win32' and python_version == '3.10' -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.8' -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp310-cp310-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.10' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp310-cp310-win_amd64.whl; sys_platform == 'win32' and python_version == '3.10' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.8' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8' windows-curses; sys_platform == 'win32' pynvml flash_attn==2.3.0 From 5a3986fb06266b4a84ac7a35e30782c709da6bab Mon Sep 17 00:00:00 2001 From: Disty0 Date: Wed, 4 Oct 2023 01:32:36 +0300 Subject: [PATCH 10/10] IPEX Optimizations --- environments/ipex.yml | 4 ++-- modeling/ipex/__init__.py | 9 +++++++-- modeling/ipex/attention.py | 33 +++++++++++++++++++-------------- modeling/ipex/diffusers.py | 9 +++++---- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/environments/ipex.yml b/environments/ipex.yml index 2dd0bb3e..88c1d140 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -23,7 +23,7 @@ dependencies: - Pillow - psutil - pip: - - -f https://developer.intel.com/ipex-whl-stable-xpu + - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - torch==2.0.1a0; sys_platform == 'linux' - torch==2.0.0a0; sys_platform == 'win32' - intel_extension_for_pytorch==2.0.110+xpu; sys_platform == 'linux' @@ -35,7 +35,7 @@ dependencies: - lupa==1.10 - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - - optimum[openvino,nncf,neural-compressor]==1.12.0 + - optimum[onnxruntime,openvino,nncf,neural-compressor]==1.13.2 - safetensors==0.3.3 - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra diff --git a/modeling/ipex/__init__.py b/modeling/ipex/__init__.py index 
9ec69012..43accd9f 100644 --- a/modeling/ipex/__init__.py +++ b/modeling/ipex/__init__.py @@ -16,7 +16,6 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.device = torch.xpu.device torch.cuda.device_count = torch.xpu.device_count torch.cuda.device_of = torch.xpu.device_of - torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard torch.cuda.get_device_name = torch.xpu.get_device_name torch.cuda.get_device_properties = torch.xpu.get_device_properties torch.cuda.init = torch.xpu.init @@ -145,7 +144,7 @@ def ipex_init(): # pylint: disable=too-many-statements ipex._C._DeviceProperties.minor = 2 #Fix functions with ipex: - torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_allocated(device)), torch.xpu.get_device_properties(device).total_memory] + torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory] torch._utils._get_available_device_type = lambda: "xpu" torch.has_cuda = True torch.cuda.has_half = True @@ -157,6 +156,12 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.get_device_properties.minor = 7 torch.cuda.ipc_collect = lambda *args, **kwargs: None torch.cuda.utilization = lambda *args, **kwargs: 0 + if hasattr(torch.xpu, 'getDeviceIdListForCard'): + torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard + torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard + else: + torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card + torch.cuda.get_device_id_list_per_card = torch.xpu.get_device_id_list_per_card ipex_hijacks() attention_init() diff --git a/modeling/ipex/attention.py b/modeling/ipex/attention.py index e38689f2..84848b6a 100644 --- a/modeling/ipex/attention.py +++ b/modeling/ipex/attention.py @@ -10,13 +10,15 @@ def torch_bmm(input, mat2, *, out=None): #ARC GPUs can't allocate more than 4GB to a single block, Slice it: batch_size_attention, input_tokens, mat2_shape = input.shape[0], input.shape[1], mat2.shape[2] - block_multiply = 2.4 if input.dtype == torch.float32 else 1.2 - block_size = (batch_size_attention * input_tokens * mat2_shape) / 1024 * block_multiply #MB + block_multiply = input.element_size() + slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply + block_size = batch_size_attention * slice_block_size + split_slice_size = batch_size_attention - if block_size >= 4000: + if block_size > 4: do_split = True #Find something divisible with the input_tokens - while ((split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply) > 4000: + while (split_slice_size * slice_block_size) > 4: split_slice_size = split_slice_size // 2 if split_slice_size <= 1: split_slice_size = 1 @@ -24,12 +26,12 @@ def torch_bmm(input, mat2, *, out=None): else: do_split = False - split_block_size = (split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply #MB split_2_slice_size = input_tokens - if split_block_size >= 4000: + if split_slice_size * slice_block_size > 4: + slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply do_split_2 = True #Find something divisible with the input_tokens - while ((split_slice_size * split_2_slice_size * mat2_shape) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size2) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 
1 @@ -71,13 +73,16 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. else: shape_one, batch_size_attention, query_tokens, shape_four = query.shape no_shape_one = False - block_multiply = 3.6 if query.dtype == torch.float32 else 1.8 - block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB + + block_multiply = query.element_size() + slice_block_size = shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply + block_size = batch_size_attention * slice_block_size + split_slice_size = batch_size_attention - if block_size >= 4000: + if block_size > 4: do_split = True #Find something divisible with the shape_one - while ((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000: + while (split_slice_size * slice_block_size) > 4: split_slice_size = split_slice_size // 2 if split_slice_size <= 1: split_slice_size = 1 @@ -85,12 +90,12 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. else: do_split = False - split_block_size = (shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply #MB split_2_slice_size = query_tokens - if split_block_size >= 4000: + if split_slice_size * slice_block_size > 4: + slice_block_size2 = shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply do_split_2 = True #Find something divisible with the batch_size_attention - while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size2) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 1 diff --git a/modeling/ipex/diffusers.py b/modeling/ipex/diffusers.py index 4c39896e..005ee49f 100644 --- a/modeling/ipex/diffusers.py +++ b/modeling/ipex/diffusers.py @@ -55,13 +55,14 @@ class SlicedAttnProcessor: # pylint: disable=too-few-public-methods ) #ARC GPUs can't allocate more than 4GB to a single block, Slice it: - block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 - block_size = (batch_size_attention * query_tokens * shape_three) / 1024 * block_multiply #MB + block_multiply = query.element_size() + slice_block_size = self.slice_size * shape_three / 1024 / 1024 * block_multiply + block_size = query_tokens * slice_block_size split_2_slice_size = query_tokens - if block_size >= 4000: + if block_size > 4: do_split_2 = True #Find something divisible with the query_tokens - while ((self.slice_size * split_2_slice_size * shape_three) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 1
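
Note on the IPEX attention changes in PATCH 10/10: the patch replaces the hard-coded dtype multipliers (2.4/1.2, 3.6/1.8) with `input.element_size()` and keeps halving the batch split until a single slice fits under the same numeric threshold it compares against, so that no single allocation exceeds what Intel ARC GPUs can handle in one block. The snippet below is a minimal standalone sketch of that slicing idea applied to a plain batched matmul; the helper name `sliced_bmm` and its structure are illustrative assumptions and not code from the patch, which instead wires this logic directly into its `torch.bmm` and `scaled_dot_product_attention` hijacks.

    import torch

    def sliced_bmm(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        # a: (batch, tokens, k), b: (batch, k, cols)
        batch, tokens, _ = a.shape
        cols = b.shape[2]
        # Per-batch-item block size, scaled by the dtype's byte width,
        # mirroring block_multiply = input.element_size() in the patch.
        slice_block_size = tokens * cols / 1024 / 1024 * a.element_size()
        split = batch
        # Halve the batch split until one slice fits the threshold value
        # the patch compares against (here: 4, taken from the patch).
        while split * slice_block_size > 4 and split > 1:
            split //= 2
        if split == batch:
            return torch.bmm(a, b)
        out = torch.empty((batch, tokens, cols), device=a.device, dtype=a.dtype)
        for start in range(0, batch, split):
            end = min(start + split, batch)
            out[start:end] = torch.bmm(a[start:end], b[start:end])
        return out

For example, `sliced_bmm(q, k.transpose(1, 2))` would compute attention scores in batch slices rather than in one oversized allocation, which is the situation the patch's torch_bmm hijack is guarding against.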