diff --git a/AI-Horde-Worker b/AI-Horde-Worker index 755696b9..3e357f4d 160000 --- a/AI-Horde-Worker +++ b/AI-Horde-Worker @@ -1 +1 @@ -Subproject commit 755696b9d4464e4167bfea5fd426686420015038 +Subproject commit 3e357f4d8b284a637564024802c22fc3b19a5ffc diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 9f3aa495..74229dbd 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -1,7 +1,7 @@ name: koboldai channels: - pytorch - - nvidia + - nvidia/label/cuda-11.8.0 - conda-forge - defaults dependencies: @@ -13,6 +13,8 @@ dependencies: - pytorch=2.0.* - python=3.8.* - pytorch-cuda=11.8 + - cuda-nvcc=11.8 + - cuda-libraries-dev=11.8 - eventlet=0.33.3 - dnspython=2.2.1 - markdown @@ -32,9 +34,9 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers[sentencepiece]==4.33.1 + - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - - optimum[onnxruntime]==1.12.0 + - optimum[onnxruntime]==1.13.2 - safetensors==0.3.3 - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra @@ -50,14 +52,17 @@ dependencies: - git+https://github.com/0cc4m/hf_bleeding_edge/ - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - einops - peft==0.3.0 - scipy - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/henk717/exllamav2/releases/download/0.4/exllamav2-0.0.4-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/henk717/exllamav2/releases/download/0.4/exllamav2-0.0.4-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - windows-curses; sys_platform == 'win32' - pynvml - xformers==0.0.21 + - https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.0/flash_attn-2.3.0+cu118torch2.0cxx11abiFALSE-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' - omegaconf diff --git a/environments/ipex.yml b/environments/ipex.yml index 1d64bdf4..944b8fa2 100644 --- a/environments/ipex.yml +++ b/environments/ipex.yml @@ -24,19 +24,23 @@ dependencies: - psutil - ffmpeg - pip: - - -f https://developer.intel.com/ipex-whl-stable-xpu - - torch==2.0.1a0 - - intel_extension_for_pytorch==2.0.110+xpu + - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + - torch==2.0.1a0; sys_platform == 'linux' + - torch==2.0.0a0; sys_platform == 'win32' + - intel_extension_for_pytorch==2.0.110+xpu; sys_platform == 'linux' + - intel_extension_for_pytorch==2.0.110+gitba7f6c1; sys_platform == 'win32' + - intel-extension-for-transformers - flask-cloudflared==0.0.10 - flask-ngrok - flask-cors - lupa==1.10 - - 
transformers[sentencepiece]==4.33.1 + - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - - optimum[onnxruntime]==1.12.0 + - optimum[onnxruntime,openvino,nncf,neural-compressor]==1.13.2 - safetensors==0.3.3 - - accelerate==0.20.3 + - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra + - flask-session - ansi2html - flask_compress - ijson @@ -44,8 +48,15 @@ dependencies: - pydub - diffusers - git+https://github.com/0cc4m/hf_bleeding_edge/ + - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - einops - peft==0.3.0 + - scipy + - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' + - https://github.com/0cc4m/exllama/releases/download/0.0.7/exllama-0.0.7-cp38-cp38-win_amd64.whl; sys_platform == 'win32' - windows-curses; sys_platform == 'win32' - pynvml - omegaconf \ No newline at end of file diff --git a/environments/rocm.yml b/environments/rocm.yml index e1eeaab0..83f9a48e 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -30,9 +30,9 @@ dependencies: - flask-ngrok - flask-cors - lupa==1.10 - - transformers[sentencepiece]==4.33.1 + - transformers[sentencepiece]==4.34.0 - huggingface_hub==0.16.4 - - optimum[onnxruntime]==1.12.0 + - optimum[onnxruntime]==1.13.2 - safetensors==0.3.3 - accelerate==0.21.0 - git+https://github.com/VE-FORBRYDERNE/mkultra @@ -47,4 +47,5 @@ dependencies: - peft==0.3.0 - windows-curses; sys_platform == 'win32' - pynvml + - https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+rocm5.4.2-cp38-cp38-linux_x86_64.whl - omegaconf \ No newline at end of file diff --git a/koboldai_settings.py b/koboldai_settings.py index 6a7ef81c..159031ea 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1401,6 +1401,8 @@ class system_settings(settings): bridge_data.horde_url = self._koboldai_var.horde_url bridge_data.api_key = self._koboldai_var.horde_api_key bridge_data.scribe_name = self._koboldai_var.horde_worker_name + bridge_data.max_length = self._koboldai_var.genamt + bridge_data.max_context_length = self._koboldai_var.max_length bridge_data.disable_terminal_ui = self._koboldai_var.host if bridge_data.worker_name == "My Awesome Instance": bridge_data.worker_name = f"KoboldAI UI Instance #{random.randint(-100000000, 100000000)}" diff --git a/modeling/inference_models/basic_hf/class.py b/modeling/inference_models/basic_hf/class.py index afca13ee..5ae2aa0d 100644 --- a/modeling/inference_models/basic_hf/class.py +++ b/modeling/inference_models/basic_hf/class.py @@ -148,6 +148,13 @@ class model_backend(InferenceModel): self.get_local_model_path(ignore_existance=True), ) + if not self.get_local_model_path(): + print(self.get_local_model_path()) + from huggingface_hub import snapshot_download + target_dir = "models/" + self.model_name.replace("/", "_") + print(self.model_name) + snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision) + 
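For reference, the snapshot fallback added above can be exercised on its own; a minimal sketch, where the repo id and target directory are placeholders and the keyword arguments mirror the call in the patch:
    from huggingface_hub import snapshot_download
    # Fetch a full model repo into a local folder (no symlinks), reusing a shared cache dir.
    snapshot_download(
        "org/model",                      # placeholder repo id
        local_dir="models/org_model",     # placeholder target directory
        local_dir_use_symlinks=False,
        cache_dir="cache/",
    )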
self.init_model_config() self.model = AutoModelForCausalLM.from_pretrained( diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py index 4539b7a3..f688d611 100644 --- a/modeling/inference_models/exllama/class.py +++ b/modeling/inference_models/exllama/class.py @@ -128,6 +128,12 @@ class model_backend(InferenceModel): return config def _load(self, save_model: bool, initial_load: bool) -> None: + if not self.get_local_model_path(): + from huggingface_hub import snapshot_download + target_dir = "models/" + self.model_name.replace("/", "_") + print(self.model_name) + snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision) + self.model = self._get_model(self.get_local_model_path(), {}) self.tokenizer = self._get_tokenizer(self.get_local_model_path()) diff --git a/modeling/inference_models/exllamav2/class.py b/modeling/inference_models/exllamav2/class.py new file mode 100644 index 00000000..15b91c8d --- /dev/null +++ b/modeling/inference_models/exllamav2/class.py @@ -0,0 +1,422 @@ +from __future__ import annotations +try: + import time, json + import torch + import requests + import numpy as np + from typing import List, Optional, Union + import os + import glob + from pathlib import Path + import re + import warnings + import gc + + import utils + from logger import logger + + from modeling import warpers + from modeling.warpers import Warper + from modeling.stoppers import Stoppers + from modeling.post_token_hooks import PostTokenHooks + from modeling.inference_model import ( + GenerationResult, + GenerationSettings, + InferenceModel, + ModelCapabilities, + ) + + from modeling.tokenizer import GenericTokenizer + + + from exllamav2.model import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config + from transformers import LlamaTokenizer + from exllamav2.generator import ExLlamaV2StreamingGenerator + load_failed = False +except: + load_failed = True + +model_backend_type = "GPTQ" +model_backend_name = "ExLlama V2" + +# When set to true, messages will appear in the console if samplers are not +# changing the scores. Keep in mind some samplers don't always change the +# scores for each token. 
+LOG_SAMPLER_NO_EFFECT = False + +class model_backend(InferenceModel): + def __init__(self) -> None: + super().__init__() + self.model_config = None + + self.model = None + self.tokenizer = None + self.cache = None + self.generator = None + + self.model_name = "" + self.path = None + + self.post_token_hooks = [ + PostTokenHooks.stream_tokens, + ] + + self.stopper_hooks = [ + Stoppers.core_stopper, + Stoppers.dynamic_wi_scanner, + Stoppers.singleline_stopper, + Stoppers.chat_mode_stopper, + Stoppers.stop_sequence_stopper, + ] + + self.capabilties = ModelCapabilities( + embedding_manipulation=False, + post_token_hooks=True, + stopper_hooks=True, + post_token_probs=False, + ) + self.disable = load_failed + + def is_valid(self, model_name, model_path, menu_path): + try: + self.model_config = self._load_config(model_name, model_path) + #TODO check if model is valid + return True + except: + return False + + def get_local_model_path(self): + return self.path or os.path.join("models", self.model_name.replace("/", "_")) + + def _load_config(self, model_name, model_path): + config = ExLlamaV2Config() + if model_path is not None and os.path.exists(model_path): + config.model_dir = model_path + elif os.path.exists("models/{}".format(model_name.replace('/', '_'))): + config.model_dir = "models/{}".format(model_name.replace('/', '_')) + config.prepare() + + return config + + def _load(self, save_model: bool, initial_load: bool) -> None: + if not self.get_local_model_path(): + from huggingface_hub import snapshot_download + target_dir = "models/" + self.model_name.replace("/", "_") + print(self.model_name) + snapshot_download(self.model_name, local_dir=target_dir, local_dir_use_symlinks=False, cache_dir="cache/", revision=utils.koboldai_vars.revision) + self.model = self._get_model(self.get_local_model_path(), {}) + #TODO support GPU split + self.model.load(None) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) + + self.cache = ExLlamaV2Cache(self.model) + + self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer.tokenizer) + + def _post_load(self) -> None: + # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer + self.tokenizer.add_bos_token = False + + # HF transformers no longer supports decode_with_prefix_space + # We work around this by wrapping decode, encode, and __call__ + # with versions that work around the 'prefix space' misfeature + # of sentencepiece. + vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size)) + has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")} + + # Wrap 'decode' with a method that always returns text starting with a space + # when the head token starts with a space. This is what 'decode_with_prefix_space' + # used to do, and we implement it using the same technique (building a cache of + # tokens that should have a prefix space, and then prepending a space if the first + # token is in this set.) We also work around a bizarre behavior in which decoding + # a single token 13 behaves differently than decoding a sequence containing only [13]. 
+ original_decode = type(self.tokenizer.tokenizer).decode + def decode_wrapper(self, token_ids, *args, **kwargs): + first = None + # Note, the code below that wraps single-value token_ids in a list + # is to work around this wonky behavior: + # >>> t.decode(13) + # '<0x0A>' + # >>> t.decode([13]) + # '\n' + # Not doing this causes token streaming to receive <0x0A> characters + # instead of newlines. + if isinstance(token_ids, int): + first = token_ids + token_ids = [first] + elif hasattr(token_ids, 'dim'): # Check for e.g. torch.Tensor + # Tensors don't support the Python standard of 'empty is False' + # and the special case of dimension 0 tensors also needs to be + # handled separately. + if token_ids.dim() == 0: + first = int(token_ids.item()) + token_ids = [first] + elif len(token_ids) > 0: + first = int(token_ids[0]) + elif token_ids is not None and len(token_ids) > 0: + first = token_ids[0] + result = original_decode(self, token_ids, *args, **kwargs) + if first is not None and first in has_prefix_space: + result = " " + result + return result + # GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it + object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer)) + + # Wrap encode and __call__ to work around the 'prefix space' misfeature also. + # The problem is that "Bob" at the start of text is encoded as if it is + # " Bob". This creates a problem because it means you can't split text, encode + # the pieces, concatenate the tokens, decode them, and get the original text back. + # The workaround is to prepend a known token that (1) starts with a space; and + # (2) is not the prefix of any other token. After searching through the vocab + # " ," (space comma) is the only token containing only printable ascii characters + # that fits this bill. By prepending ',' to the text, the original encode + # method always returns [1919, ...], where the tail of the sequence is the + # actual encoded result we want without the prefix space behavior. + original_encode = type(self.tokenizer.tokenizer).encode + def encode_wrapper(self, text, *args, **kwargs): + if type(text) is str: + text = ',' + text + result = original_encode(self, text, *args, **kwargs) + result = result[1:] + else: + result = original_encode(self, text, *args, **kwargs) + return result + object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer)) + + # Since 'encode' is documented as being deprecated, also override __call__. + # This doesn't appear to currently be used by KoboldAI, but doing so + # in case someone uses it in the future. 
+ original_call = type(self.tokenizer.tokenizer).__call__ + def call_wrapper(self, text, *args, **kwargs): + if type(text) is str: + text = ',' + text + result = original_call(self, text, *args, **kwargs) + result = result[1:] + else: + result = original_call(self, text, *args, **kwargs) + return result + object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer)) + + # Cache the newline token (for single line mode) + # Since there is only one Llama token containing newline, just encode \n + self.newline_tokens = self.tokenizer.encode("\n") + self.bracket_tokens = [i for i, tok in enumerate(vocab) if '[' in tok or ']' in tok] + self.tokenizer._koboldai_header = self.tokenizer.encode("") + + def unload(self): + #self.model_config = None # This breaks more than it fixes - Henk + + self.model = None + self.tokenizer = None + self.cache = None + self.generator = None + + self.model_name = "" + self.path = None + + with torch.no_grad(): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated") + for tensor in gc.get_objects(): + try: + if torch.is_tensor(tensor): + tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype)) + except: + pass + gc.collect() + try: + with torch.no_grad(): + torch.cuda.empty_cache() + except: + pass + + def _apply_warpers( + self, scores: torch.Tensor, input_ids: torch.Tensor + ) -> torch.Tensor: + warpers.update_settings() + + if LOG_SAMPLER_NO_EFFECT: + pre = torch.Tensor(scores) + + for sid in utils.koboldai_vars.sampler_order: + warper = Warper.from_id(sid) + + if not warper.value_is_valid(): + continue + + if warper == warpers.RepetitionPenalty: + # Rep pen needs more data than other samplers + scores = warper.torch(scores, input_ids=input_ids.cuda()) + else: + scores = warper.torch(scores) + + assert scores is not None, f"Scores are None; warper '{warper}' is to blame" + + if LOG_SAMPLER_NO_EFFECT: + if torch.equal(pre, scores): + logger.info(warper, "had no effect on the scores.") + pre = torch.Tensor(scores) + return scores + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + seed: Optional[int] = None, + **kwargs, + ) -> GenerationResult: + if seed: + torch.manual_seed(seed) + + bad_words_ids = [self.tokenizer.bos_token_id] + if utils.koboldai_vars.use_default_badwordsids: + bad_words_ids.append(self.tokenizer.eos_token_id) + bad_words_ids.extend(self.bracket_tokens) + if single_line: + bad_words_ids.extend(self.newline_tokens) + + if not isinstance(prompt_tokens, torch.Tensor): + gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] + else: + gen_in = prompt_tokens + + self.generator._gen_begin_reuse(gen_in, None) + + for i in range(max_new): + logits = self.model.forward(self.generator.sequence_ids[:, -1:], self.generator.cache) + for bad_word_id in bad_words_ids: + logits[:, :, bad_word_id] = -10000.0 + + logits = torch.unsqueeze(logits[0, -1, :], 0) + + scores = self._apply_warpers(logits, gen_in) + + scores = torch.softmax(scores, dim=-1) + + # Work around a bug in torch.multinomial (https://github.com/pytorch/pytorch/issues/48841) + # With low probability, multinomial can return an element with zero weight. Since this + # happens infrequently, just sample repeatedly until all tokens have non-zero probability. 
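As an aside, a minimal self-contained sketch of the same resampling guard used in the loop below; the tensor values here are made up for illustration:
    import torch
    # One row of softmaxed scores; the last token deliberately has zero weight.
    probs = torch.tensor([[0.5, 0.5, 0.0]])
    for _ in range(100):
        token = torch.multinomial(probs, 1)
        # Accept the draw only if every sampled token had strictly positive probability.
        if (probs.gather(1, token) > 0).all():
            break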
+ for _ in range(100): + token = torch.multinomial(scores, 1) + # Verify that all selected tokens correspond to positive probabilities. + if (scores.gather(1, token) > 0).all(): + break + + if (token == self.tokenizer.eos_token_id).any(): + break + + if self.generator.sequence_ids is None: + self.generator.sequence_ids = token + else: + self.generator.sequence_ids = torch.cat([self.generator.sequence_ids, token.cpu()], dim=1) + + self._post_token_gen(self.generator.sequence_ids) + + utils.koboldai_vars.generated_tkns += 1 + + # Apply stoppers + do_stop = False + for stopper in self.stopper_hooks: + do_stop = stopper(self, self.generator.sequence_ids) + if do_stop: + break + if do_stop: + break + + seq = self.generator.sequence_ids[:, gen_in.size(1):] + + return GenerationResult( + model=self, + out_batches=np.array(seq,), + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) + + def _get_model(self, location: str, tf_kwargs: Dict): + if not self.model_config: + self.model_config = ExLlamaV2Config() + self.model_config.model_dir = location + self.model_config.prepare() + + # self.model_config.gpu_peer_fix = True + return ExLlamaV2(self.model_config) + + def _get_tokenizer(self, location: str): + tokenizer = GenericTokenizer(LlamaTokenizer.from_pretrained(location)) + return tokenizer + + def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}): + requested_parameters = [] + gpu_count = torch.cuda.device_count() + layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None + + requested_parameters.append({ + "uitype": "slider", + "unit": "int", + "label": "Maximum Context", + "id": "max_ctx", + "min": 2048, + "max": 16384, + "step": 512, + "default": 2048, + "tooltip": "The maximum context size the model supports", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + requested_parameters.append({ + "uitype": "slider", + "unit": "float", + "label": "Embedding Compression", + "id": "compress_emb", + "min": 1, + "max": 8, + "step": 0.25, + "default": 1, + "tooltip": "If the model requires compressed embeddings, set them here", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + requested_parameters.append({ + "uitype": "slider", + "unit": "float", + "label": "NTK alpha", + "id": "ntk_alpha", + "min": 1, + "max": 32, + "step": 0.25, + "default": 1, + "tooltip": "NTK alpha value", + "menu_path": "Configuration", + "extra_classes": "", + "refresh_model_inputs": False + }) + + return requested_parameters + + def set_input_parameters(self, parameters): + gpu_count = torch.cuda.device_count() + + self.model_config.max_seq_len = parameters["max_ctx"] + self.model_config.compress_pos_emb = parameters["compress_emb"] + self.model_config.alpha_value = parameters["ntk_alpha"] + + # Disable half2 for HIP + self.model_config.rmsnorm_no_half2 = bool(torch.version.hip) + self.model_config.rope_no_half2 = bool(torch.version.hip) + self.model_config.matmul_no_half2 = bool(torch.version.hip) + self.model_config.silu_no_half2 = bool(torch.version.hip) + + # Disable scaled_dot_product_attention if torch version < 2 + if torch.__version__.startswith("1."): + self.model_config.sdp_thd = 0 + + 
self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id'] + self.path = parameters['path'] if 'path' in parameters else None \ No newline at end of file diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 3094dc33..62e89072 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -362,7 +362,7 @@ class model_backend(HFTorchInferenceModel): model = load_quant_offload_device_map(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "opt": model = load_quant_offload_device_map(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) - elif model_tseype == "mpt": + elif model_type == "mpt": model = load_quant_offload_device_map(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "gpt_bigcode": model = load_quant_offload_device_map(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias).half() diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 7e291b93..8cb52d69 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -232,7 +232,7 @@ class HFInferenceModel(InferenceModel): self.model_type = str(self.model_config.model_type) # These are model specific tokenizer overrides if a model has bad defaults - if self.model_type == "llama": + if self.model_type == "llama" or self.model_type == "mistral": # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer self.tokenizer.add_bos_token = False self.tokenizer.legacy = False diff --git a/modeling/ipex/__init__.py b/modeling/ipex/__init__.py index 9ec69012..43accd9f 100644 --- a/modeling/ipex/__init__.py +++ b/modeling/ipex/__init__.py @@ -16,7 +16,6 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.device = torch.xpu.device torch.cuda.device_count = torch.xpu.device_count torch.cuda.device_of = torch.xpu.device_of - torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard torch.cuda.get_device_name = torch.xpu.get_device_name torch.cuda.get_device_properties = torch.xpu.get_device_properties torch.cuda.init = torch.xpu.init @@ -145,7 +144,7 @@ def ipex_init(): # pylint: disable=too-many-statements ipex._C._DeviceProperties.minor = 2 #Fix functions with ipex: - torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_allocated(device)), torch.xpu.get_device_properties(device).total_memory] + torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory] torch._utils._get_available_device_type = lambda: "xpu" torch.has_cuda = True torch.cuda.has_half = True @@ -157,6 +156,12 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.get_device_properties.minor = 7 torch.cuda.ipc_collect = lambda *args, **kwargs: None torch.cuda.utilization = lambda *args, **kwargs: 0 + if hasattr(torch.xpu, 'getDeviceIdListForCard'): + torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard + torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard + else: + torch.cuda.getDeviceIdListForCard = 
torch.xpu.get_device_id_list_per_card + torch.cuda.get_device_id_list_per_card = torch.xpu.get_device_id_list_per_card ipex_hijacks() attention_init() diff --git a/modeling/ipex/attention.py b/modeling/ipex/attention.py index d7335bfa..84848b6a 100644 --- a/modeling/ipex/attention.py +++ b/modeling/ipex/attention.py @@ -10,13 +10,15 @@ def torch_bmm(input, mat2, *, out=None): #ARC GPUs can't allocate more than 4GB to a single block, Slice it: batch_size_attention, input_tokens, mat2_shape = input.shape[0], input.shape[1], mat2.shape[2] - block_multiply = 2.4 if input.dtype == torch.float32 else 1.2 - block_size = (batch_size_attention * input_tokens * mat2_shape) / 1024 * block_multiply #MB + block_multiply = input.element_size() + slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply + block_size = batch_size_attention * slice_block_size + split_slice_size = batch_size_attention - if block_size >= 4000: + if block_size > 4: do_split = True #Find something divisible with the input_tokens - while ((split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply) > 4000: + while (split_slice_size * slice_block_size) > 4: split_slice_size = split_slice_size // 2 if split_slice_size <= 1: split_slice_size = 1 @@ -24,12 +26,12 @@ def torch_bmm(input, mat2, *, out=None): else: do_split = False - split_block_size = (split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply #MB split_2_slice_size = input_tokens - if split_block_size >= 4000: + if split_slice_size * slice_block_size > 4: + slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply do_split_2 = True #Find something divisible with the input_tokens - while ((split_slice_size * split_2_slice_size * mat2_shape) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size2) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 1 @@ -64,14 +66,23 @@ def torch_bmm(input, mat2, *, out=None): original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False): #ARC GPUs can't allocate more than 4GB to a single block, Slice it: - shape_one, batch_size_attention, query_tokens, shape_four = query.shape - block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 - block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB + if len(query.shape) == 3: + batch_size_attention, query_tokens, shape_four = query.shape + shape_one = 1 + no_shape_one = True + else: + shape_one, batch_size_attention, query_tokens, shape_four = query.shape + no_shape_one = False + + block_multiply = query.element_size() + slice_block_size = shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply + block_size = batch_size_attention * slice_block_size + split_slice_size = batch_size_attention - if block_size >= 4000: + if block_size > 4: do_split = True #Find something divisible with the shape_one - while ((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000: + while (split_slice_size * slice_block_size) > 4: split_slice_size = split_slice_size // 2 if split_slice_size <= 1: split_slice_size = 1 @@ -79,12 +90,12 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. 
else: do_split = False - split_block_size = (shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply #MB split_2_slice_size = query_tokens - if split_block_size >= 4000: + if split_slice_size * slice_block_size > 4: + slice_block_size2 = shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply do_split_2 = True #Find something divisible with the batch_size_attention - while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size2) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 1 @@ -101,21 +112,39 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name start_idx_2 = i2 * split_2_slice_size end_idx_2 = (i2 + 1) * split_2_slice_size - hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( - query[:, start_idx:end_idx, start_idx_2:end_idx_2], - key[:, start_idx:end_idx, start_idx_2:end_idx_2], - value[:, start_idx:end_idx, start_idx_2:end_idx_2], - attn_mask=attn_mask[:, start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, + if no_shape_one: + hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( + query[start_idx:end_idx, start_idx_2:end_idx_2], + key[start_idx:end_idx, start_idx_2:end_idx_2], + value[start_idx:end_idx, start_idx_2:end_idx_2], + attn_mask=attn_mask[start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( + query[:, start_idx:end_idx, start_idx_2:end_idx_2], + key[:, start_idx:end_idx, start_idx_2:end_idx_2], + value[:, start_idx:end_idx, start_idx_2:end_idx_2], + attn_mask=attn_mask[:, start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + if no_shape_one: + hidden_states[start_idx:end_idx] = original_scaled_dot_product_attention( + query[start_idx:end_idx], + key[start_idx:end_idx], + value[start_idx:end_idx], + attn_mask=attn_mask[start_idx:end_idx] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + hidden_states[:, start_idx:end_idx] = original_scaled_dot_product_attention( + query[:, start_idx:end_idx], + key[:, start_idx:end_idx], + value[:, start_idx:end_idx], + attn_mask=attn_mask[:, start_idx:end_idx] if attn_mask is not None else attn_mask, dropout_p=dropout_p, is_causal=is_causal ) - else: - hidden_states[:, start_idx:end_idx] = original_scaled_dot_product_attention( - query[:, start_idx:end_idx], - key[:, start_idx:end_idx], - value[:, start_idx:end_idx], - attn_mask=attn_mask[:, start_idx:end_idx] if attn_mask is not None else attn_mask, - dropout_p=dropout_p, is_causal=is_causal - ) else: return original_scaled_dot_product_attention( query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal diff --git a/modeling/ipex/diffusers.py b/modeling/ipex/diffusers.py index 4c39896e..005ee49f 100644 --- a/modeling/ipex/diffusers.py +++ b/modeling/ipex/diffusers.py @@ -55,13 +55,14 @@ class SlicedAttnProcessor: # pylint: disable=too-few-public-methods ) #ARC GPUs can't allocate more than 4GB to a single block, Slice it: - 
block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 - block_size = (batch_size_attention * query_tokens * shape_three) / 1024 * block_multiply #MB + block_multiply = query.element_size() + slice_block_size = self.slice_size * shape_three / 1024 / 1024 * block_multiply + block_size = query_tokens * slice_block_size split_2_slice_size = query_tokens - if block_size >= 4000: + if block_size > 4: do_split_2 = True #Find something divisible with the query_tokens - while ((self.slice_size * split_2_slice_size * shape_three) / 1024 * block_multiply) > 4000: + while (split_2_slice_size * slice_block_size) > 4: split_2_slice_size = split_2_slice_size // 2 if split_2_slice_size <= 1: split_2_slice_size = 1 diff --git a/requirements.txt b/requirements.txt index 39fb208b..2d17c5a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -transformers[sentencepiece]==4.33.1 +transformers[sentencepiece]==4.34.0 huggingface_hub==0.16.4 -optimum[onnxruntime]==1.12.0 +optimum[onnxruntime]==1.13.2 safetensors==0.3.3 Flask==2.3.3 Flask-SocketIO==5.3.2 @@ -41,10 +41,12 @@ git+https://github.com/0cc4m/hf_bleeding_edge/ einops peft==0.3.0 scipy -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp310-cp310-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.10' -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp310-cp310-win_amd64.whl; sys_platform == 'win32' and python_version == '3.10' -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.8' -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp310-cp310-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.10' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp310-cp310-win_amd64.whl; sys_platform == 'win32' and python_version == '3.10' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux' and python_version == '3.8' +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.2/auto_gptq-0.4.2+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32' and python_version == '3.8' windows-curses; sys_platform == 'win32' pynvml -omegaconf +flash_attn==2.3.0 +xformers==0.0.21 +exllamav2==0.0.4 +omegaconf diff --git a/static/klite.html b/static/klite.html index cf30ee07..9a4bca92 100644 --- a/static/klite.html +++ b/static/klite.html @@ -3,7 +3,7 @@ +