Basic exllama plugin

2025-06-05 21:59:24 +02:00 · 2023-06-04 15:40:12 +02:00
parent c82625490a
commit b35f61e987
1 changed files with 277 additions and 0 deletions
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -0,0 +1,277 @@
 from __future__ import annotations
 import time, json
 import torch
 import requests
 import numpy as np
 from typing import List, Optional, Union
 import os
 import glob
 from pathlib import Path
 import re
 import utils
 from logger import logger
 from modeling.inference_model import (
    GenerationResult,
    GenerationSettings,
    InferenceModel,
    ModelCapabilities,
 )
 from modeling.tokenizer import GenericTokenizer
 from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
 from transformers import LlamaTokenizer
 from exllama.generator import ExLlamaGenerator
 import traceback
 model_backend_name = "ExLlama"
 def load_model_gptq_settings(path):
    """Detect whether the model directory *path* holds GPTQ weights.

    Args:
        path: Directory expected to contain ``config.json`` and the weights.

    Returns:
        A ``(gptq_model, gptq_file)`` pair. ``gptq_model`` is True when the
        config declares ``gptq_bits`` or a legacy ``4bit*.safetensors`` file
        is present; ``gptq_file`` is then the weights path. Both elements are
        False when nothing was detected or ``config.json`` cannot be read.
    """
    try:
        with open(os.path.join(path, "config.json"), "r") as config_file:
            js = json.load(config_file)
    except (OSError, ValueError, TypeError):
        # Missing/unreadable file, invalid JSON, or a non-string path.
        # Always hand back a pair: callers unpack the result into two names.
        return False, False
    if "gptq_bits" in js:
        return True, os.path.join(path, "model.safetensors")
    gptq_legacy_files = glob.glob(os.path.join(path, "4bit*.safetensors"))
    if gptq_legacy_files:
        return True, gptq_legacy_files[0]
    return False, False
 class model_backend(InferenceModel):
    def __init__(self) -> None:
        """Create an unloaded backend; every handle starts out as None."""
        super().__init__()
        # Filled in later: model_config by is_valid(), model/tokenizer by _load().
        self.model_config = self.model = self.tokenizer = None
        # Model identity, filled in by set_input_parameters().
        self.model_name = self.path = None
    def is_valid(self, model_name, model_path, menu_path):
        """Report whether this backend can load the given model.

        The model qualifies when GPTQ weights are detected in ``model_path``
        and an ExLlama config could be built. As a side effect the config is
        stored on ``self.model_config``.

        Args:
            model_name: Model identifier, used to locate the ``models/`` folder.
            model_path: Explicit model directory, or None.
            menu_path: Unused here.

        Returns:
            True when the model is usable by this backend, otherwise False.
            Any error raised while probing is treated as "not valid".
        """
        try:
            settings = load_model_gptq_settings(model_path)
            # Tolerate a non-tuple failure value from the helper instead of
            # letting the unpacking error escape.
            gptq_model = settings[0] if isinstance(settings, tuple) else False
            self.model_config = self._load_config(model_name, model_path)
        except Exception:
            # Probing is best-effort, but KeyboardInterrupt/SystemExit must
            # still propagate, hence no bare ``except``.
            return False
        return bool(self.model_config and gptq_model)
    def get_local_model_path(self):
        return self.path or os.path.join("models", self.model_name.replace("/", "_"))
    def _load_config(self, model_name, model_path):
        if model_path is not None and os.path.exists(model_path):
            return ExLlamaConfig(os.path.join(model_path, "config.json"))
        if(os.path.exists("models/{}".format(model_name.replace('/', '_')))):
            return ExLlamaConfig(os.path.join("models/{}".format(model_name.replace('/', '_')), "config.json"))
        return False
    def _load(self, save_model: bool, initial_load: bool) -> None:
        """Load model, tokenizer, cache and generator from the local model folder.

        Neither ``save_model`` nor ``initial_load`` is consulted here.
        """
        model_dir = self.get_local_model_path()
        self.model = self._get_model(model_dir, {})
        tokenizer_file = os.path.join(model_dir, "tokenizer.model")
        self.tokenizer = self._get_tokenizer(tokenizer_file)
        self.cache = ExLlamaCache(self.model)
        # The generator receives the wrapped tokenizer object, not the GenericTokenizer.
        self.generator = ExLlamaGenerator(self.model, self.tokenizer.tokenizer, self.cache)
    def _post_load(self) -> None:
        """Patch the loaded tokenizer to neutralise sentencepiece's 'prefix space'.

        Disables automatic BOS insertion, then installs instance-level
        replacements for ``decode``, ``encode`` and ``__call__`` on
        ``self.tokenizer`` so that text can be split, encoded piecewise,
        concatenated and decoded back to the original string.
        """
        # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer
        self.tokenizer.add_bos_token = False
        # HF transformers no longer supports decode_with_prefix_space
        # We work around this by wrapping decode, encode, and __call__
        # with versions that work around the 'prefix space' misfeature
        # of sentencepiece.
        vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size))
        # Ids of every vocabulary entry whose token text begins with the "▁" marker.
        has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")}
        # Wrap 'decode' with a method that always returns text starting with a space
        # when the head token starts with a space. This is what 'decode_with_prefix_space'
        # used to do, and we implement it using the same technique (building a cache of
        # tokens that should have a prefix space, and then prepending a space if the first
        # token is in this set.) We also work around a bizarre behavior in which decoding
        # a single token 13 behaves differently than decoding a sequence containing only [13].
        # NOTE(review): the unbound method is taken from the wrapped tokenizer's class
        # but is later invoked with the GenericTokenizer as `self`; this presumably
        # relies on GenericTokenizer forwarding attribute access -- confirm.
        original_decode = type(self.tokenizer.tokenizer).decode
        def decode_wrapper(self, token_ids, *args, **kwargs):
            # `first` ends up as the id of the leading token, or None for empty input.
            first = None
            # Note, the code below that wraps single-value token_ids in a list
            # is to work around this wonky behavior:
            #   >>> t.decode(13)
            #   '<0x0A>'
            #   >>> t.decode([13])
            #   '\n'
            # Not doing this causes token streaming to receive <0x0A> characters
            # instead of newlines.
            if isinstance(token_ids, int):
                first = token_ids
                token_ids = [first]
            elif hasattr(token_ids, 'dim'): # Check for e.g. torch.Tensor
                # Tensors don't support the Python standard of 'empty is False'
                # and the special case of dimension 0 tensors also needs to be
                # handled separately.
                if token_ids.dim() == 0:
                    first = int(token_ids.item())
                    token_ids = [first]
                elif len(token_ids) > 0:
                    first = int(token_ids[0])
            elif token_ids is not None and len(token_ids) > 0:
                first = token_ids[0]
            result = original_decode(self, token_ids, *args, **kwargs)
            # Restore the leading space that the underlying decode drops.
            if first is not None and first in has_prefix_space:
                result = " " + result
            return result
        # GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it
        object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer))
        # Wrap encode and __call__ to work around the 'prefix space' misfeature also.
        # The problem is that "Bob" at the start of text is encoded as if it is
        # " Bob". This creates a problem because it means you can't split text, encode
        # the pieces, concatenate the tokens, decode them, and get the original text back.
        # The workaround is to prepend a known token that (1) starts with a space; and
        # (2) is not the prefix of any other token. After searching through the vocab
        # " ," (space comma) is the only token containing only printable ascii characters
        # that fits this bill. By prepending ',' to the text, the original encode
        # method always returns [1919, ...], where the tail of the sequence is the
        # actual encoded result we want without the prefix space behavior.
        original_encode = type(self.tokenizer.tokenizer).encode
        def encode_wrapper(self, text, *args, **kwargs):
            # Only exact `str` inputs get the sentinel treatment; anything else
            # is passed to the original encode unchanged.
            if type(text) is str:
                text = ',' + text
                result = original_encode(self, text, *args, **kwargs)
                # Drop the sentinel token produced by the prepended ','.
                result = result[1:]
            else:
                result = original_encode(self, text, *args, **kwargs)
            return result
        object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer))
        # Since 'encode' is documented as being deprecated, also override __call__.
        # This doesn't appear to currently be used by KoboldAI, but doing so
        # in case someone uses it in the future.
        original_call = type(self.tokenizer.tokenizer).__call__
        def call_wrapper(self, text, *args, **kwargs):
            if type(text) is str:
                text = ',' + text
                result = original_call(self, text, *args, **kwargs)
                # NOTE(review): assumes the object returned by __call__ can be
                # sliced like the list returned by encode -- TODO confirm.
                result = result[1:]
            else:
                result = original_call(self, text, *args, **kwargs)
            return result
        # NOTE(review): Python resolves implicit `tokenizer(...)` calls on the type,
        # not the instance, so this instance attribute may only take effect for
        # explicit `tokenizer.__call__(...)` lookups -- confirm.
        object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer))
    def _raw_generate(
        self,
        prompt_tokens: Union[List[int], torch.Tensor],
        max_new: int,
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        seed: Optional[int] = None,
        **kwargs,
    ) -> GenerationResult:
        """Generate up to ``max_new`` tokens following ``prompt_tokens``.

        Args:
            prompt_tokens: Prompt token ids as a list or tensor. A list or a
                1-D tensor is given a leading batch axis; tensors of higher
                rank are used as-is.
            max_new: Upper bound on the number of tokens to sample.
            gen_settings: Source of ``temp``, ``top_k`` and ``top_p``.
            single_line: Passed through to the GenerationResult.
            batch_count: Accepted for interface compatibility; not used here.
            seed: When given, seeds torch's global RNG before sampling.
            **kwargs: Ignored.

        Returns:
            A GenerationResult whose ``out_batches`` holds the generator's
            sequence with the prompt columns sliced off.
        """
        if seed is not None:
            # Assumes the generator samples through torch's global RNG -- TODO confirm.
            torch.manual_seed(seed)
        if isinstance(prompt_tokens, torch.Tensor):
            gen_in = prompt_tokens
            if gen_in.dim() == 1:
                # Match the list path: add the batch axis that the
                # ``size(1)`` slicing below depends on.
                gen_in = gen_in[None]
        else:
            gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
        sampler = self.generator.settings
        # Temperature is floored at 0.01.
        sampler.temperature = max(gen_settings.temp, 0.01)
        # A non-positive top_k is replaced by 10000.
        sampler.top_k = gen_settings.top_k if gen_settings.top_k > 0 else 10000
        sampler.top_p = gen_settings.top_p
        sampler.min_p = 0.0
        self.generator.gen_begin(gen_in)
        eos_token_id = self.tokenizer.eos_token_id
        for _ in range(max_new):
            token = self.generator.gen_single_token()
            if token.item() == eos_token_id:
                break
        return GenerationResult(
            model=self,
            out_batches=np.array(
                self.generator.sequence[:, gen_in.size(1):],
            ),
            prompt=prompt_tokens,
            is_whole_generation=True,
            single_line=single_line,
        )
    def _get_model(self, location: str, tf_kwargs: dict):
        """Instantiate the ExLlama model from the weights found in ``location``.

        Args:
            location: Model directory to probe for GPTQ weights.
            tf_kwargs: Accepted for interface compatibility; not used here.

        Returns:
            An ExLlama instance built from ``self.model_config``, whose
            ``model_path`` is set to the detected weights file.

        Raises:
            FileNotFoundError: No GPTQ weights could be detected in ``location``.
        """
        settings = load_model_gptq_settings(location)
        # Fail with a clear message rather than on tuple-unpacking or by
        # handing a non-path value to the model loader.
        if not isinstance(settings, tuple) or not settings[0]:
            raise FileNotFoundError("No GPTQ model weights found in {}".format(location))
        self.model_config.model_path = settings[1]
        return ExLlama(self.model_config)
    def _get_tokenizer(self, location: str):
        """Load the LlamaTokenizer at ``location`` wrapped in a GenericTokenizer.

        The wrapper's ``_koboldai_header`` is set to the encoding of the
        empty string.
        """
        llama_tokenizer = LlamaTokenizer.from_pretrained(location)
        wrapped = GenericTokenizer(llama_tokenizer)
        wrapped._koboldai_header = wrapped.encode("")
        return wrapped
    def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
        requested_parameters = []
        gpu_count = torch.cuda.device_count()
        layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
        requested_parameters.append({
                                        "uitype": "Valid Display",
                                        "unit": "text",
                                        "label": "Current Allocated Layers: %1/{}".format(layer_count), #%1 will be the validation value
                                        "id": "valid_layers",
                                        "max": layer_count,
                                        "step": 1,
                                        "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="},
                                        "menu_path": "Layers",
                                        "extra_classes": "",
                                        "refresh_model_inputs": False
                                    })
        for i in range(gpu_count):
            requested_parameters.append({
                                            "uitype": "slider",
                                            "unit": "int",
                                            "label": "{} Layers".format(torch.cuda.get_device_name(i)),
                                            "id": "{}_Layers".format(i),
                                            "min": 0,
                                            "max": layer_count,
                                            "step": 1,
                                            "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)], "value": layer_count, 'check': "="},
                                            "check_message": "The sum of assigned layers must equal {}".format(layer_count),
                                            "default": [layer_count if i == 0 else 0],
                                            "tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)),
                                            "menu_path": "Layers",
                                            "extra_classes": "",
                                            "refresh_model_inputs": False
                                        })
        return requested_parameters
    def set_input_parameters(self, parameters):
        gpu_count = torch.cuda.device_count()
        layers = []
        for i in range(gpu_count):
            if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric():
                layers.append(int(parameters["{}_Layers".format(i)]))
            elif isinstance(parameters["{}_Layers".format(i)], str):
                 layers.append(None)
            else:
                layers.append(parameters["{}_Layers".format(i)])
        self.layers = layers
        for i, l in enumerate(layers):
            if l > 0:
                self.model_config.device_map.layers.extend([f"cuda:{i}"] * l)
        self.model_config.device_map.lm_head = "cuda:0"
        self.model_config.device_map.norm = "cuda:0"
        self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
        self.path = parameters['path'] if 'path' in parameters else None