import os, sys
from typing import Optional

try:
    from hf_bleeding_edge import AutoConfig
except ImportError:
    from transformers import AutoConfig

import warnings
import utils
import json
import koboldai_settings
from logger import logger
from modeling.inference_model import InferenceModel
import torch
import gc


class HFInferenceModel(InferenceModel):
    def __init__(self) -> None:
        super().__init__()
        self.model_config = None

        # TODO: model_name should probably be an instantiation parameter all the
        # way down the inheritance chain.
        self.model_name = None

        self.path = None
        self.hf_torch = False
        self.model = None
        self.tokenizer = None
        self.badwordsids = koboldai_settings.badwordsids_default
        self.usegpu = False

    def is_valid(self, model_name, model_path, menu_path):
        try:
            if model_path is not None and os.path.exists(model_path):
                self.model_config = AutoConfig.from_pretrained(model_path)
            elif os.path.exists("models/{}".format(model_name.replace('/', '_'))):
                self.model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache")
            else:
                self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
            return True
        except:
            return False

    def get_requested_parameters(self, model_name, model_path, menu_path, parameters={}):
        requested_parameters = []
        if not self.hf_torch:
            return []
        if model_name in ('customhuggingface', 'customgptq'):
            requested_parameters.append({
                "uitype": "text",
                "unit": "text",
                "label": "Huggingface Model Name",
                "id": "custom_model_name",
                "default": parameters["custom_model_name"] if "custom_model_name" in parameters and parameters["custom_model_name"] != "" else "",
                "check": {"value": "", 'check': "!="},
                "tooltip": "Model name from https://huggingface.co/",
                "menu_path": "",
                "refresh_model_inputs": True,
                "extra_classes": ""
            })

        if model_name not in ('customhuggingface', 'customgptq') or "custom_model_name" in parameters:
            model_name = parameters["custom_model_name"] if "custom_model_name" in parameters and parameters["custom_model_name"] != "" else model_name
            if model_path is not None and os.path.exists(model_path):
                self.model_config = AutoConfig.from_pretrained(model_path)
            elif os.path.exists("models/{}".format(model_name.replace('/', '_'))):
                self.model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache")
            else:
                self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")

            layer_count = (
                self.model_config["n_layer"] if isinstance(self.model_config, dict)
                else self.model_config.num_layers if hasattr(self.model_config, "num_layers")
                else self.model_config.n_layer if hasattr(self.model_config, "n_layer")
                else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers')
                else None
            )
            layer_count = None if hasattr(self, "get_model_type") and self.get_model_type() == "gpt2" else layer_count  # Skip layers if we're a GPT2 model as it doesn't support breakmodel
            if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
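                # A previously saved layer split is read back from
                # settings/<model>.generic_hf_torch.model_backend.settings, a plain JSON
                # file. An illustrative (hypothetical) example of its contents:
                #     {"layers": [24, 8], "disk_layers": 0}
                # where 'layers' is the per-GPU block split and 'disk_layers' is the
                # number of blocks offloaded to disk; both keys are optional.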
                if os.path.exists("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
                    with open("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
                        temp = json.load(f)
                        break_values = temp['layers'] if 'layers' in temp else [layer_count]
                        disk_blocks = temp['disk_layers'] if 'disk_layers' in temp else 0
                else:
                    break_values = [layer_count]
                    disk_blocks = 0

                break_values = [int(x) for x in break_values if x != '' and x is not None]
                gpu_count = torch.cuda.device_count()
                break_values += [0] * (gpu_count - len(break_values))
                if disk_blocks is not None:
                    break_values += [int(disk_blocks)]

                requested_parameters.append({
                    "uitype": "Valid Display",
                    "unit": "text",
                    "label": "Current Allocated Layers: %1/{}".format(layer_count),  # %1 will be the validation value
                    "id": "valid_layers",
                    "max": layer_count,
                    "step": 1,
                    "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)] + ['CPU_Layers'] + (['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
                    "menu_path": "Layers",
                    "extra_classes": "",
                    "refresh_model_inputs": False
                })
                for i in range(gpu_count):
                    requested_parameters.append({
                        "uitype": "slider",
                        "unit": "int",
                        "label": "{} Layers".format(torch.cuda.get_device_name(i)),
                        "id": "{}_Layers".format(i),
                        "min": 0,
                        "max": layer_count,
                        "step": 1,
                        "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)] + ['CPU_Layers'] + (['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
                        "check_message": "The sum of assigned layers must equal {}".format(layer_count),
                        "default": break_values[i],
                        "tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)),
                        "menu_path": "Layers",
                        "extra_classes": "",
                        "refresh_model_inputs": False
                    })
                requested_parameters.append({
                    "uitype": "slider",
                    "unit": "int",
                    "label": "CPU Layers",
                    "id": "CPU_Layers",
                    "min": 0,
                    "max": layer_count,
                    "step": 1,
                    "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)] + ['CPU_Layers'] + (['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
                    "check_message": "The sum of assigned layers must equal {}".format(layer_count),
                    "default": layer_count - sum(break_values),
                    "tooltip": "The number of layers to put on the CPU. This will use your system RAM. It will also do inference partially on CPU. Use if you must.",
                    "menu_path": "Layers",
                    "extra_classes": "",
                    "refresh_model_inputs": False
                })
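                # Example defaults for a hypothetical 2-GPU machine and a 32-layer model:
                # a saved split of {"layers": [24, 8]} gives GPU sliders defaulting to 24
                # and 8 and a CPU slider defaulting to 32 - (24 + 8) = 0. Every slider's
                # "check" rule requires the GPU, CPU and (optional) Disk layer counts to
                # sum to layer_count before the UI accepts the configuration.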
                if disk_blocks is not None:
                    requested_parameters.append({
                        "uitype": "slider",
                        "unit": "int",
                        "label": "Disk Layers",
                        "id": "Disk_Layers",
                        "min": 0,
                        "max": layer_count,
                        "step": 1,
                        "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)] + ['CPU_Layers'] + (['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
                        "check_message": "The sum of assigned layers must equal {}".format(layer_count),
                        "default": disk_blocks,
                        "tooltip": "The number of layers to put on the disk. This will use your hard drive. This is VERY slow in comparison to GPU or CPU. Use as a last resort.",
                        "menu_path": "Layers",
                        "extra_classes": "",
                        "refresh_model_inputs": False
                    })
            else:
                requested_parameters.append({
                    "uitype": "toggle",
                    "unit": "bool",
                    "label": "Use GPU",
                    "id": "use_gpu",
                    "default": True,
                    "tooltip": "Whether or not to use the GPU",
                    "menu_path": "Layers",
                    "extra_classes": "",
                    "refresh_model_inputs": False
                })

        return requested_parameters

    def set_input_parameters(self, parameters):
        if self.hf_torch and hasattr(self, "get_model_type") and self.get_model_type() != "gpt2":
            layer_count = (
                self.model_config["n_layer"] if isinstance(self.model_config, dict)
                else self.model_config.num_layers if hasattr(self.model_config, "num_layers")
                else self.model_config.n_layer if hasattr(self.model_config, "n_layer")
                else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers')
                else None
            )
            if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
                gpu_count = torch.cuda.device_count()
                layers = []
                for i in range(gpu_count):
                    if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric():
                        layers.append(int(parameters["{}_Layers".format(i)]))
                    elif isinstance(parameters["{}_Layers".format(i)], str):
                        layers.append(None)
                    else:
                        layers.append(parameters["{}_Layers".format(i)])
                self.cpu_layers = int(parameters['CPU_Layers']) if 'CPU_Layers' in parameters else None
                if isinstance(self.cpu_layers, str):
                    self.cpu_layers = int(self.cpu_layers) if self.cpu_layers.isnumeric() else 0
                self.layers = layers
                self.disk_layers = parameters['Disk_Layers'] if 'Disk_Layers' in parameters else 0
                if isinstance(self.disk_layers, str):
                    self.disk_layers = int(self.disk_layers) if self.disk_layers.isnumeric() else 0
                print("TODO: Allow config")
                # self.usegpu = self.cpu_layers == 0 and breakmodel.disk_blocks == 0 and sum(self.layers)-self.layers[0] == 0
                self.model_type = self.get_model_type()
                self.breakmodel = ((self.model_type != 'gpt2') or self.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not self.nobreakmodel
                self.lazy_load = True
                logger.debug("Model type: {}".format(self.model_type))
            else:
                logger.debug("Disabling breakmodel and lazyload")
                self.usegpu = parameters['use_gpu'] if 'use_gpu' in parameters else None
                self.breakmodel = False
                self.lazy_load = False

        logger.info(parameters)
        self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
        self.path = parameters['path'] if 'path' in parameters else None

    def unload(self):
        if hasattr(self, 'model'):
            self.model = None
        if hasattr(self, 'tokenizer'):
            self.tokenizer = None
        if hasattr(self, 'model_config'):
            self.model_config = None
        with torch.no_grad():
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated")
                for tensor in gc.get_objects():
                    try:
                        if torch.is_tensor(tensor):
                            tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype))
                    except:
                        pass
        gc.collect()
        try:
            with torch.no_grad():
                torch.cuda.empty_cache()
        except:
            pass

    def _pre_load(self) -> None:
        # HACK: Make model instantiation work without UI parameters
        self.model_name = self.model_name or utils.koboldai_vars.model
        return super()._pre_load()
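    # _post_load applies per-architecture fixups after the model and tokenizer have been
    # loaded: LLaMA/Mistral tokenizers get decode/encode/__call__ wrappers that undo
    # SentencePiece's prefix-space behavior, XGLM/OPT/BLOOM get newline-mode tweaks, and
    # the default bad-words list is rebuilt for vocabularies containing "[" or "]".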
    def _post_load(self) -> None:
        self.badwordsids = koboldai_settings.badwordsids_default
        self.model_type = str(self.model_config.model_type)

        # These are model specific tokenizer overrides if a model has bad defaults
        if self.model_type == "llama" or self.model_type == "mistral":
            # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is
            # the actual LlamaTokenizer
            self.tokenizer.add_bos_token = False
            self.tokenizer.legacy = False

            # HF transformers no longer supports decode_with_prefix_space
            # We work around this by wrapping decode, encode, and __call__
            # with versions that work around the 'prefix space' misfeature
            # of sentencepiece.
            vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size))
            has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")}

            # Wrap 'decode' with a method that always returns text starting with a space
            # when the head token starts with a space. This is what 'decode_with_prefix_space'
            # used to do, and we implement it using the same technique (building a cache of
            # tokens that should have a prefix space, and then prepending a space if the first
            # token is in this set.) We also work around a bizarre behavior in which decoding
            # a single token 13 behaves differently than decoding a sequence containing only [13].
            original_decode = type(self.tokenizer.tokenizer).decode

            def decode_wrapper(self, token_ids, *args, **kwargs):
                first = None
                # Note, the code below that wraps single-value token_ids in a list
                # is to work around this wonky behavior:
                #   >>> t.decode(13)
                #   '<0x0A>'
                #   >>> t.decode([13])
                #   '\n'
                # Not doing this causes token streaming to receive <0x0A> characters
                # instead of newlines.
                if isinstance(token_ids, int):
                    first = token_ids
                    token_ids = [first]
                elif hasattr(token_ids, 'dim'):  # Check for e.g. torch.Tensor
                    # Tensors don't support the Python standard of 'empty is False'
                    # and the special case of dimension 0 tensors also needs to be
                    # handled separately.
                    if token_ids.dim() == 0:
                        first = int(token_ids.item())
                        token_ids = [first]
                    elif len(token_ids) > 0:
                        first = int(token_ids[0])
                elif token_ids is not None and len(token_ids) > 0:
                    first = token_ids[0]
                result = original_decode(self, token_ids, *args, **kwargs)
                if first is not None and first in has_prefix_space:
                    result = " " + result
                return result

            # GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it
            object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer))

            # Wrap encode and __call__ to work around the 'prefix space' misfeature also.
            # The problem is that "Bob" at the start of text is encoded as if it is
            # " Bob". This creates a problem because it means you can't split text, encode
            # the pieces, concatenate the tokens, decode them, and get the original text back.
            # The workaround is to prepend a known token that (1) starts with a space; and
            # (2) is not the prefix of any other token. After searching through the vocab
            # " ," (space comma) is the only token containing only printable ascii characters
            # that fits this bill. By prepending ',' to the text, the original encode
            # method always returns [1919, ...], where the tail of the sequence is the
            # actual encoded result we want without the prefix space behavior.
            original_encode = type(self.tokenizer.tokenizer).encode

            def encode_wrapper(self, text, *args, **kwargs):
                if type(text) is str:
                    text = ',' + text
                    result = original_encode(self, text, *args, **kwargs)
                    result = result[1:]
                else:
                    result = original_encode(self, text, *args, **kwargs)
                return result

            object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer))
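            # Note that the ',' sentinel trick is only applied to plain-str inputs; lists
            # of ids and other input types fall through to the original method unchanged.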
            # Since 'encode' is documented as being deprecated, also override __call__.
            # This doesn't appear to currently be used by KoboldAI, but we do so in case
            # someone uses it in the future.
            original_call = type(self.tokenizer.tokenizer).__call__

            def call_wrapper(self, text, *args, **kwargs):
                if type(text) is str:
                    text = ',' + text
                    result = original_call(self, text, *args, **kwargs)
                    result = result[1:]
                else:
                    result = original_call(self, text, *args, **kwargs)
                return result

            object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer))

        elif self.model_type == "opt":
            self.tokenizer._koboldai_header = self.tokenizer.encode("")
            self.tokenizer.add_bos_token = False
            self.tokenizer.add_prefix_space = False

        # Change newline behavior to match model quirks
        if self.model_type == "xglm":
            # Default to </s> newline mode if using XGLM
            utils.koboldai_vars.newlinemode = "s"
        elif self.model_type in ["opt", "bloom"]:
            # Handle </s> but don't convert newlines if using Fairseq models that have newlines trained in them
            utils.koboldai_vars.newlinemode = "ns"

        # Clean up tokens that cause issues
        if (
            self.badwordsids == koboldai_settings.badwordsids_default
            and self.model_type not in ("gpt2", "gpt_neo", "gptj")
        ):
            self.badwordsids = [
                [v]
                for k, v in self.tokenizer.get_vocab().items()
                if any(c in str(k) for c in "[]")
            ]

            try:
                self.badwordsids.remove([self.tokenizer.pad_token_id])
            except:
                pass

            if utils.koboldai_vars.newlinemode == "n":
                self.badwordsids.append([self.tokenizer.eos_token_id])

        return super()._post_load()

    def get_local_model_path(
        self, legacy: bool = False, ignore_existance: bool = False
    ) -> Optional[str]:
        """
        Returns a string of the model's path locally, or None if it is not downloaded.
        If ignore_existance is true, it will always return a path.
        """
        if self.path is not None:
            if os.path.exists(self.path):
                return self.path

        if self.model_name in ["NeoCustom", "GPT2Custom", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]:
            model_path = self.path
            assert model_path

            # Path can be absolute or relative to models directory
            if os.path.exists(model_path):
                return model_path

            model_path = os.path.join("models", model_path)

            try:
                assert os.path.exists(model_path)
            except AssertionError:
                logger.error(f"Custom model does not exist at '{utils.koboldai_vars.custmodpth}' or '{model_path}'.")
                raise

            return model_path

        basename = self.model_name.replace("/", "_")
        if legacy:
            ret = basename
        else:
            ret = os.path.join("models", basename)

        if os.path.isdir(ret) or ignore_existance:
            return ret

        return None

    def init_model_config(self) -> None:
        # Get the model_type from the config or assume a model type if it isn't present
        try:
            self.model_config = AutoConfig.from_pretrained(
                self.get_local_model_path() or self.model_name,
                revision=utils.koboldai_vars.revision,
                cache_dir="cache",
            )
            self.model_type = self.model_config.model_type
            if "gptq_bits" in dir(self.model_config):
                self.gptq_model = True
                self.gptq_bits = self.model_config.gptq_bits
                self.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1
                self.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1
                self.gptq_file = None
            else:
                self.gptq_model = False
        except ValueError:
            self.model_type = {
                "NeoCustom": "gpt_neo",
                "GPT2Custom": "gpt2",
            }.get(self.model_name)

            if not self.model_type:
                logger.warning(
                    "No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)"
                )
                self.model_type = "gpt_neo"
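
# Rough usage sketch (hypothetical subclass and model name; KoboldAI's real backends
# live under modeling/inference_models/ and implement the remaining loading hooks):
#
#     class MyTorchBackend(HFInferenceModel):
#         def __init__(self, model_name: str) -> None:
#             super().__init__()
#             self.model_name = model_name
#             self.hf_torch = True
#
#     backend = MyTorchBackend("EleutherAI/gpt-neo-2.7B")
#     if backend.is_valid(backend.model_name, None, ""):
#         backend.init_model_config()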