Mirror of https://github.com/KoboldAI/KoboldAI-Client.git
Merge pull request #447 from 0cc4m/exllama
Merge exllama backend into united.
@@ -3929,7 +3929,8 @@ def generate(txt, minimum, maximum, found_entries=None, gen_mode=GenerationMode.
         return
 
     for i in range(koboldai_vars.numseqs):
-        koboldai_vars.lua_koboldbridge.generated[i+1][koboldai_vars.generated_tkns] = int(genout[i, -1].item())
+        if len(genout[i]) > 0:
+            koboldai_vars.lua_koboldbridge.generated[i+1][koboldai_vars.generated_tkns] = int(genout[i, -1].item())
         koboldai_vars.lua_koboldbridge.outputs[i+1] = utils.decodenewlines(tokenizer.decode(genout[i, -already_generated:]))
 
     execute_outmod()
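The new if len(genout[i]) > 0: guard above only matters when a sequence row is still empty. A minimal standalone sketch (not part of the commit) of the failure mode it avoids:

import torch

genout = torch.zeros((1, 0), dtype=torch.long)  # a batch row with no generated tokens yet
# genout[0, -1] would raise IndexError on the empty row; the guard skips the lookup instead.
if len(genout[0]) > 0:
    last_token = int(genout[0, -1].item())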
@@ -54,5 +54,7 @@ dependencies:
     - einops
     - peft==0.3.0
     - scipy
+    - --find-links=https://0cc4m.github.io/exllama/exllama-whl-links.html
+    - exllama==0.0.6
     - windows-curses; sys_platform == 'win32'
     - pynvml
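As a quick sanity check (not part of the commit), the pinned exllama==0.0.6 wheel from the --find-links index above should expose the classes the new backend imports; a minimal sketch:

# Assumes the environment above has been installed; verifies the exllama wheel is importable.
from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
from exllama.generator import ExLlamaGenerator

print(ExLlama, ExLlamaCache, ExLlamaConfig, ExLlamaGenerator)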
modeling/inference_models/exllama/class.py (normal file, 481 lines)
@@ -0,0 +1,481 @@
from __future__ import annotations

import time, json
import torch
import requests
import numpy as np
from typing import Dict, List, Optional, Union
import os
import glob
from pathlib import Path
import re
import warnings
import gc

import utils
from logger import logger

from modeling import warpers
from modeling.warpers import Warper
from modeling.stoppers import Stoppers
from modeling.post_token_hooks import PostTokenHooks
from modeling.inference_model import (
    GenerationResult,
    GenerationSettings,
    InferenceModel,
    ModelCapabilities,
)

from modeling.tokenizer import GenericTokenizer

from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
from transformers import LlamaTokenizer
from exllama.generator import ExLlamaGenerator

model_backend_type = "GPTQ"
model_backend_name = "ExLlama"

# When set to true, messages will appear in the console if samplers are not
# changing the scores. Keep in mind some samplers don't always change the
# scores for each token.
LOG_SAMPLER_NO_EFFECT = False


def load_model_gptq_settings(path):
    try:
        js = json.load(open(path + "/config.json", "r"))
    except Exception as e:
        return False, False

    gptq_model = False
    gptq_file = False
    gptq_in_config = False

    try:
        if js['quantization_config']['quant_method'] == "gptq":
            gptq_in_config = True
    except:
        pass

    gptq_legacy_files = glob.glob(os.path.join(path, "*4bit*.safetensors"))
    if "gptq_bits" in js or gptq_in_config:
        gptq_model = True
        gptq_file = os.path.join(path, "model.safetensors")
    elif gptq_legacy_files:
        gptq_model = True
        gptq_file = gptq_legacy_files[0]
        fname = Path(gptq_file).parts[-1]
        g = re.findall("(?:4bit)(?:-)(\\d+)(?:g-?)", fname)

    return gptq_model, gptq_file


class model_backend(InferenceModel):
    def __init__(self) -> None:
        super().__init__()
        self.model_config = None

        self.model = None
        self.tokenizer = None
        self.cache = None
        self.generator = None

        self.model_name = ""
        self.path = None

        self.post_token_hooks = [
            PostTokenHooks.stream_tokens,
        ]

        self.stopper_hooks = [
            Stoppers.core_stopper,
            Stoppers.dynamic_wi_scanner,
            Stoppers.singleline_stopper,
            Stoppers.chat_mode_stopper,
            Stoppers.stop_sequence_stopper,
        ]

        self.capabilties = ModelCapabilities(
            embedding_manipulation=False,
            post_token_hooks=True,
            stopper_hooks=True,
            post_token_probs=False,
        )

    def is_valid(self, model_name, model_path, menu_path):
        gptq_model, _ = load_model_gptq_settings(model_path)
        try:
            self.model_config = self._load_config(model_name, model_path)
            return self.model_config and gptq_model
        except:
            return False

    def get_local_model_path(self):
        return self.path or os.path.join("models", self.model_name.replace("/", "_"))

    def _load_config(self, model_name, model_path):
        config = False
        if model_path is not None and os.path.exists(model_path):
            config = ExLlamaConfig(os.path.join(model_path, "config.json"))
        if not config and os.path.exists("models/{}".format(model_name.replace('/', '_'))):
            config = ExLlamaConfig(os.path.join("models/{}".format(model_name.replace('/', '_')), "config.json"))

        return config

    def _load(self, save_model: bool, initial_load: bool) -> None:
        self.model = self._get_model(self.get_local_model_path(), {})
        self.tokenizer = self._get_tokenizer(self.get_local_model_path())

        self.cache = ExLlamaCache(self.model)

        self.generator = ExLlamaGenerator(self.model, self.tokenizer.tokenizer, self.cache)

    def _post_load(self) -> None:
        # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer
        self.tokenizer.add_bos_token = False

        # HF transformers no longer supports decode_with_prefix_space
        # We work around this by wrapping decode, encode, and __call__
        # with versions that work around the 'prefix space' misfeature
        # of sentencepiece.
        vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size))
        has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")}

        # Wrap 'decode' with a method that always returns text starting with a space
        # when the head token starts with a space. This is what 'decode_with_prefix_space'
        # used to do, and we implement it using the same technique (building a cache of
        # tokens that should have a prefix space, and then prepending a space if the first
        # token is in this set.) We also work around a bizarre behavior in which decoding
        # a single token 13 behaves differently than decoding a sequence containing only [13].
        original_decode = type(self.tokenizer.tokenizer).decode
        def decode_wrapper(self, token_ids, *args, **kwargs):
            first = None
            # Note, the code below that wraps single-value token_ids in a list
            # is to work around this wonky behavior:
            # >>> t.decode(13)
            # '<0x0A>'
            # >>> t.decode([13])
            # '\n'
            # Not doing this causes token streaming to receive <0x0A> characters
            # instead of newlines.
            if isinstance(token_ids, int):
                first = token_ids
                token_ids = [first]
            elif hasattr(token_ids, 'dim'): # Check for e.g. torch.Tensor
                # Tensors don't support the Python standard of 'empty is False'
                # and the special case of dimension 0 tensors also needs to be
                # handled separately.
                if token_ids.dim() == 0:
                    first = int(token_ids.item())
                    token_ids = [first]
                elif len(token_ids) > 0:
                    first = int(token_ids[0])
            elif token_ids is not None and len(token_ids) > 0:
                first = token_ids[0]
            result = original_decode(self, token_ids, *args, **kwargs)
            if first is not None and first in has_prefix_space:
                result = " " + result
            return result
        # GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it
        object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer))

        # Wrap encode and __call__ to work around the 'prefix space' misfeature also.
        # The problem is that "Bob" at the start of text is encoded as if it is
        # " Bob". This creates a problem because it means you can't split text, encode
        # the pieces, concatenate the tokens, decode them, and get the original text back.
        # The workaround is to prepend a known token that (1) starts with a space; and
        # (2) is not the prefix of any other token. After searching through the vocab
        # " ," (space comma) is the only token containing only printable ascii characters
        # that fits this bill. By prepending ',' to the text, the original encode
        # method always returns [1919, ...], where the tail of the sequence is the
        # actual encoded result we want without the prefix space behavior.
        original_encode = type(self.tokenizer.tokenizer).encode
        def encode_wrapper(self, text, *args, **kwargs):
            if type(text) is str:
                text = ',' + text
                result = original_encode(self, text, *args, **kwargs)
                result = result[1:]
            else:
                result = original_encode(self, text, *args, **kwargs)
            return result
        object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer))

        # Since 'encode' is documented as being deprecated, also override __call__.
        # This doesn't appear to be used by KoboldAI at the moment, but we wrap it
        # in case someone uses it in the future.
        original_call = type(self.tokenizer.tokenizer).__call__
        def call_wrapper(self, text, *args, **kwargs):
            if type(text) is str:
                text = ',' + text
                result = original_call(self, text, *args, **kwargs)
                result = result[1:]
            else:
                result = original_call(self, text, *args, **kwargs)
            return result
        object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer))

        # Cache the newline token (for single line mode)
        # Since there is only one Llama token containing newline, just encode \n
        self.newline_tokens = self.tokenizer.encode("\n")
        self.bracket_tokens = [i for i, tok in enumerate(vocab) if '[' in tok or ']' in tok]
        self.tokenizer._koboldai_header = self.tokenizer.encode("")

    def unload(self):
        self.model_config = None

        self.model = None
        self.tokenizer = None
        self.cache = None
        self.generator = None

        self.model_name = ""
        self.path = None

        with torch.no_grad():
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated")
                for tensor in gc.get_objects():
                    try:
                        if torch.is_tensor(tensor):
                            tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype))
                    except:
                        pass
        gc.collect()
        try:
            with torch.no_grad():
                torch.cuda.empty_cache()
        except:
            pass

    def _apply_warpers(
        self, scores: torch.Tensor, input_ids: torch.Tensor
    ) -> torch.Tensor:
        warpers.update_settings()

        if LOG_SAMPLER_NO_EFFECT:
            pre = torch.Tensor(scores)

        for sid in utils.koboldai_vars.sampler_order:
            warper = Warper.from_id(sid)

            if not warper.value_is_valid():
                continue

            if warper == warpers.RepetitionPenalty:
                # Rep pen needs more data than other samplers
                scores = warper.torch(scores, input_ids=input_ids)
            else:
                scores = warper.torch(scores)

            assert scores is not None, f"Scores are None; warper '{warper}' is to blame"

            if LOG_SAMPLER_NO_EFFECT:
                if torch.equal(pre, scores):
                    logger.info(f"{warper} had no effect on the scores.")
                pre = torch.Tensor(scores)
        return scores

    def _raw_generate(
        self,
        prompt_tokens: Union[List[int], torch.Tensor],
        max_new: int,
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        seed: Optional[int] = None,
        **kwargs,
    ) -> GenerationResult:
        if seed:
            torch.manual_seed(seed)

        bad_words_ids = [self.tokenizer.bos_token_id]
        if utils.koboldai_vars.use_default_badwordids:
            bad_words_ids.append(self.tokenizer.eos_token_id)
            bad_words_ids.extend(self.bracket_tokens)
        if single_line:
            bad_words_ids.extend(self.newline_tokens)

        if not isinstance(prompt_tokens, torch.Tensor):
            gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
        else:
            gen_in = prompt_tokens

        self.generator.gen_begin_reuse(gen_in)

        for i in range(max_new):
            logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache)
            for bad_word_id in bad_words_ids:
                logits[:, :, bad_word_id] = -10000.0

            logits = torch.unsqueeze(logits[0, -1, :], 0)

            scores = self._apply_warpers(logits, gen_in)

            scores = torch.softmax(scores, dim=-1)

            # Work around a bug in torch.multinomial (https://github.com/pytorch/pytorch/issues/48841)
            # With low probability, multinomial can return an element with zero weight. Since this
            # happens infrequently, just sample repeatedly until all tokens have non-zero probability.
            for _ in range(100):
                token = torch.multinomial(scores, 1)
                # Verify that all selected tokens correspond to positive probabilities.
                if (scores.gather(1, token) > 0).all():
                    break

            if (token == self.tokenizer.eos_token_id).any():
                break

            self.generator.gen_accept_token(token)

            self._post_token_gen(self.generator.sequence)

            utils.koboldai_vars.generated_tkns += 1

            # Apply stoppers
            do_stop = False
            for stopper in self.stopper_hooks:
                do_stop = stopper(self, self.generator.sequence)
                if do_stop:
                    break
            if do_stop:
                break

        seq = self.generator.sequence[:, gen_in.size(1):]

        return GenerationResult(
            model=self,
            out_batches=np.array(seq,),
            prompt=prompt_tokens,
            is_whole_generation=True,
            single_line=single_line,
        )

    def _get_model(self, location: str, tf_kwargs: Dict):
        if not self.model_config:
            self.model_config = ExLlamaConfig(os.path.join(location, "config.json"))

        _, self.model_config.model_path = load_model_gptq_settings(location)
        # self.model_config.gpu_peer_fix = True
        return ExLlama(self.model_config)

    def _get_tokenizer(self, location: str):
        tokenizer = GenericTokenizer(LlamaTokenizer.from_pretrained(location))
        return tokenizer

    def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
        requested_parameters = []
        gpu_count = torch.cuda.device_count()
        layer_count = (
            self.model_config["n_layer"] if isinstance(self.model_config, dict)
            else self.model_config.num_layers if hasattr(self.model_config, "num_layers")
            else self.model_config.n_layer if hasattr(self.model_config, "n_layer")
            else self.model_config.num_hidden_layers if hasattr(self.model_config, "num_hidden_layers")
            else None
        )
        requested_parameters.append({
            "uitype": "Valid Display",
            "unit": "text",
            "label": "Current Allocated Layers: %1/{}".format(layer_count), #%1 will be the validation value
            "id": "valid_layers",
            "max": layer_count,
            "step": 1,
            "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="},
            "menu_path": "Layers",
            "extra_classes": "",
            "refresh_model_inputs": False
        })
        for i in range(gpu_count):
            requested_parameters.append({
                "uitype": "slider",
                "unit": "int",
                "label": "{} Layers".format(torch.cuda.get_device_name(i)),
                "id": "{}_Layers".format(i),
                "min": 0,
                "max": layer_count,
                "step": 1,
                "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="},
                "check_message": "The sum of assigned layers must equal {}".format(layer_count),
                "default": [layer_count if i == 0 else 0],
                "tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)),
                "menu_path": "Layers",
                "extra_classes": "",
                "refresh_model_inputs": False
            })

        requested_parameters.append({
            "uitype": "slider",
            "unit": "int",
            "label": "Maximum Context",
            "id": "max_ctx",
            "min": 2048,
            "max": 16384,
            "step": 512,
            "default": 2048,
            "tooltip": "The maximum context size the model supports",
            "menu_path": "Configuration",
            "extra_classes": "",
            "refresh_model_inputs": False
        })

        requested_parameters.append({
            "uitype": "slider",
            "unit": "float",
            "label": "Embedding Compression",
            "id": "compress_emb",
            "min": 1,
            "max": 8,
            "step": 0.25,
            "default": 1,
            "tooltip": "If the model requires compressed embeddings, set them here",
            "menu_path": "Configuration",
            "extra_classes": "",
            "refresh_model_inputs": False
        })

        requested_parameters.append({
            "uitype": "slider",
            "unit": "float",
            "label": "NTK alpha",
            "id": "ntk_alpha",
            "min": 1,
            "max": 32,
            "step": 0.25,
            "default": 1,
            "tooltip": "NTK alpha value",
            "menu_path": "Configuration",
            "extra_classes": "",
            "refresh_model_inputs": False
        })

        return requested_parameters

    def set_input_parameters(self, parameters):
        gpu_count = torch.cuda.device_count()
        layers = []
        for i in range(gpu_count):
            if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric():
                layers.append(int(parameters["{}_Layers".format(i)]))
            elif isinstance(parameters["{}_Layers".format(i)], str):
                layers.append(None)
            else:
                layers.append(parameters["{}_Layers".format(i)])

        self.layers = layers
        self.model_config.device_map.layers = []
        for i, l in enumerate(layers):
            if l > 0:
                self.model_config.device_map.layers.extend([f"cuda:{i}"] * l)
        self.model_config.device_map.lm_head = "cuda:0"
        self.model_config.device_map.norm = "cuda:0"

        self.model_config.max_seq_len = parameters["max_ctx"]
        self.model_config.compress_pos_emb = parameters["compress_emb"]
        self.model_config.alpha_value = parameters["ntk_alpha"]
        self.model_config.calculate_rotary_embedding_base()

        # Disable half2 for HIP
        self.model_config.rmsnorm_no_half2 = bool(torch.version.hip)
        self.model_config.rope_no_half2 = bool(torch.version.hip)
        self.model_config.matmul_no_half2 = bool(torch.version.hip)
        self.model_config.silu_no_half2 = bool(torch.version.hip)

        # Disable scaled_dot_product_attention if torch version < 2
        if torch.__version__.startswith("1."):
            self.model_config.sdp_thd = 0

        self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
        self.path = parameters['path'] if 'path' in parameters else None
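The decode and encode wrappers installed in _post_load above work around two LlamaTokenizer quirks described in its comments. A standalone sketch of both behaviors (not part of the commit; the checkpoint path is a placeholder):

from transformers import LlamaTokenizer

tok = LlamaTokenizer.from_pretrained("models/my-llama", add_bos_token=False)  # hypothetical local path

# Prefix-space misfeature: "Bob" is encoded as if it were " Bob", so encoding text in
# pieces and concatenating the ids does not round-trip. The wrapper's fix: prepend ","
# (a token that starts with a space and is not the prefix of any other token) and drop
# the first id from the result.
plain = tok.encode("Bob")
fixed = tok.encode("," + "Bob")[1:]

# Decode quirk handled by the wrapper: a bare int decodes differently from a
# one-element list (e.g. '<0x0A>' vs '\n' for token 13), which breaks token streaming.
print(repr(tok.decode(13)), repr(tok.decode([13])))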
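For orientation, a hedged sketch of how KoboldAI-style code might drive the backend defined in class.py above. The model name, path, and 32-layer split are placeholders, it assumes a single GPU, and a real setup goes through the KoboldAI UI rather than direct calls:

backend = model_backend()

model_path = "models/my-gptq-llama"  # hypothetical local GPTQ checkpoint
if backend.is_valid("my-gptq-llama", model_path, menu_path=None):
    backend.set_input_parameters({
        "id": "my-gptq-llama",
        "path": model_path,
        "0_Layers": 32,      # assumes one GPU and a 32-layer model; the sum must equal layer_count
        "max_ctx": 2048,
        "compress_emb": 1,
        "ntk_alpha": 1,
    })
    backend._load(save_model=False, initial_load=True)
    backend._post_load()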