diff --git a/aiserver.py b/aiserver.py index 2fd605e3..bebd233f 100644 --- a/aiserver.py +++ b/aiserver.py @@ -534,8 +534,15 @@ koboldai_vars = koboldai_settings.koboldai_vars(socketio) utils.koboldai_vars = koboldai_vars utils.socketio = socketio -# HACK: Weird import position to steal koboldai_vars from utils -from model import APIInferenceModel, GenericHFTorchInferenceModel, CustomGPT2HFTorchInferenceModel, HFMTJInferenceModel, HordeInferenceModel, OpenAIAPIInferenceModel, patch_transformers +# Weird import position to steal koboldai_vars from utils +from modeling.patches import patch_transformers +from modeling.inference_models.api import APIInferenceModel +from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel +from modeling.inference_models.legacy_gpt2_hf import CustomGPT2HFTorchInferenceModel +from modeling.inference_models.hf_mtj import HFMTJInferenceModel +from modeling.inference_models.horde import HordeInferenceModel +from modeling.inference_models.openai import OpenAIAPIInferenceModel + old_socketio_on = socketio.on def new_socketio_on(*a, **k): diff --git a/logger.py b/logger.py index 8da2aa7e..c0b8b8b0 100644 --- a/logger.py +++ b/logger.py @@ -2,6 +2,18 @@ import sys from functools import partialmethod from loguru import logger +# Yes this shouldn't be here but I couldn't really find a better place to put +# it barring creating a whole file just for this which is rather silly +class Colors: + PURPLE = "\033[95m" + BLUE = "\033[94m" + CYAN = "\033[96m" + GREEN = "\033[92m" + YELLOW = "\033[93m" + RED = "\033[91m" + END = "\033[0m" + UNDERLINE = "\033[4m" + STDOUT_LEVELS = ["GENERATION", "PROMPT"] INIT_LEVELS = ["INIT", "INIT_OK", "INIT_WARN", "INIT_ERR"] MESSAGE_LEVELS = ["MESSAGE"] diff --git a/model.py b/model.py deleted file mode 100644 index 37741585..00000000 --- a/model.py +++ /dev/null @@ -1,2908 +0,0 @@ -# Before merge: -# - Fix Lua -# - Check if probabilities work -# - Fix any TODOB4MERGE comments -from __future__ import annotations - -import bisect -import copy -import requests -from dataclasses import dataclass -from eventlet import tpool -import gc -import shutil -import contextlib -import functools -import itertools -import json -import os -import time -import traceback -from typing import Dict, Iterable, List, Optional, Tuple, Union -import zipfile -from tqdm.auto import tqdm -from logger import logger -import torch_lazy_loader - -import warpers -from warpers import Warper - -import torch -from torch.nn import Embedding -import numpy as np -import transformers -from transformers import ( - StoppingCriteria, - GPT2Tokenizer, - GPT2LMHeadModel, - GPTNeoForCausalLM, - AutoModelForCausalLM, - AutoTokenizer, - PreTrainedModel, - modeling_utils, - AutoConfig, - LogitsProcessorList, - LogitsProcessor, -) - -import utils -import koboldai_settings - -try: - import breakmodel - import accelerate.utils -except ModuleNotFoundError as e: - if not utils.koboldai_vars.use_colab_tpu: - raise e - -HACK_currentmodel = None - -try: - import tpu_mtj_backend -except ModuleNotFoundError as e: - # Not on TPU... 
hopefully - if utils.koboldai_vars.use_colab_tpu: - raise e - -# HACK: Tttttttterrrible structure hack -class colors: - PURPLE = "\033[95m" - BLUE = "\033[94m" - CYAN = "\033[96m" - GREEN = "\033[92m" - YELLOW = "\033[93m" - RED = "\033[91m" - END = "\033[0m" - UNDERLINE = "\033[4m" - - -class OpenAIAPIError(Exception): - def __init__(self, error_type: str, error_message) -> None: - super().__init__(f"{error_type}: {error_message}") - - -class HordeException(Exception): - """To be used for errors on server side of the Horde.""" - - pass - - -class ColabException(Exception): - """To be used for errors when using the Colab API as an interface.""" - - pass - - -class APIException(Exception): - """To be used for errors when using the Kobold API as an interface.""" - - pass - - -class GenerationSettings: - def __init__(self, **overrides) -> None: - for setting in [ - "temp", - "top_p", - "top_k", - "tfs", - "typical", - "top_a", - "rep_pen", - "rep_pen_slope", - "rep_pen_range", - "sampler_order", - ]: - setattr( - self, - setting, - overrides.get(setting, getattr(utils.koboldai_vars, setting)), - ) - - -class Stoppers: - @staticmethod - def core_stopper( - model: InferenceModel, - input_ids: torch.LongTensor, - ) -> bool: - if not utils.koboldai_vars.inference_config.do_core: - return False - - utils.koboldai_vars.generated_tkns += 1 - - if ( - not utils.koboldai_vars.standalone - and utils.koboldai_vars.lua_koboldbridge.generated_cols - and utils.koboldai_vars.generated_tkns - != utils.koboldai_vars.lua_koboldbridge.generated_cols - ): - raise RuntimeError( - f"Inconsistency detected between KoboldAI Python and Lua backends ({utils.koboldai_vars.generated_tkns} != {utils.koboldai_vars.lua_koboldbridge.generated_cols})" - ) - - if utils.koboldai_vars.abort or ( - utils.koboldai_vars.inference_config.stop_at_genamt - and utils.koboldai_vars.generated_tkns >= utils.koboldai_vars.genamt - ): - utils.koboldai_vars.abort = False - model.gen_state["regeneration_required"] = False - model.gen_state["halt"] = False - return True - - if utils.koboldai_vars.standalone: - return False - - assert input_ids.ndim == 2 - - model.gen_state[ - "regeneration_required" - ] = utils.koboldai_vars.lua_koboldbridge.regeneration_required - model.gen_state["halt"] = not utils.koboldai_vars.lua_koboldbridge.generating - utils.koboldai_vars.lua_koboldbridge.regeneration_required = False - - for i in ( - range(utils.koboldai_vars.numseqs) - if not utils.koboldai_vars.alt_multi_gen - else range(1) - ): - utils.koboldai_vars.lua_koboldbridge.generated[i + 1][ - utils.koboldai_vars.generated_tkns - ] = int(input_ids[i, -1].item()) - - return model.gen_state["regeneration_required"] or model.gen_state["halt"] - - @staticmethod - def dynamic_wi_scanner( - model: InferenceModel, - input_ids: torch.LongTensor, - ) -> bool: - if not utils.koboldai_vars.inference_config.do_dynamic_wi: - return False - - if not utils.koboldai_vars.dynamicscan: - return False - - if len(model.gen_state["wi_scanner_excluded_keys"]) != input_ids.shape[0]: - model.gen_state["wi_scanner_excluded_keys"] - print(model.tokenizer.decode(model.gen_state["wi_scanner_excluded_keys"])) - print(model.tokenizer.decode(input_ids.shape[0])) - - assert len(model.gen_state["wi_scanner_excluded_keys"]) == input_ids.shape[0] - - tail = input_ids[..., -utils.koboldai_vars.generated_tkns :] - for i, t in enumerate(tail): - decoded = utils.decodenewlines(model.tokenizer.decode(t)) - _, _, _, found = utils.koboldai_vars.calc_ai_text( - submitted_text=decoded, 
send_context=False - ) - found = list( - set(found) - set(model.gen_state["wi_scanner_excluded_keys"][i]) - ) - if found: - print("FOUNDWI", found) - return True - return False - - @staticmethod - def chat_mode_stopper( - model: InferenceModel, - input_ids: torch.LongTensor, - ) -> bool: - if not utils.koboldai_vars.chatmode: - return False - - data = [model.tokenizer.decode(x) for x in input_ids] - # null_character = model.tokenizer.encode(chr(0))[0] - if "completed" not in model.gen_state: - model.gen_state["completed"] = [False] * len(input_ids) - - for i in range(len(input_ids)): - if ( - data[i][-1 * (len(utils.koboldai_vars.chatname) + 1) :] - == utils.koboldai_vars.chatname + ":" - ): - model.gen_state["completed"][i] = True - if all(model.gen_state["completed"]): - utils.koboldai_vars.generated_tkns = utils.koboldai_vars.genamt - del model.gen_state["completed"] - return True - return False - - -class PostTokenHooks: - @staticmethod - def stream_tokens( - model: InferenceModel, - input_ids: torch.LongTensor, - ) -> None: - if not model.gen_state["do_streaming"]: - return - - if not utils.koboldai_vars.output_streaming: - return - - data = [ - utils.applyoutputformatting( - utils.decodenewlines(model.tokenizer.decode(x[-1])), - no_sentence_trimming=True, - no_single_line=True, - ) - for x in input_ids - ] - utils.koboldai_vars.actions.stream_tokens(data) - - -# We only want to use logit manipulations and such on our core text model -class use_core_manipulations: - # These must be set by wherever they get setup - get_logits_processor: callable = None - sample: callable = None - get_stopping_criteria: callable = None - - # We set these automatically - old_get_logits_processor: callable = None - old_sample: callable = None - old_get_stopping_criteria: callable = None - - def __enter__(self): - if use_core_manipulations.get_logits_processor: - use_core_manipulations.old_get_logits_processor = ( - transformers.GenerationMixin._get_logits_processor - ) - transformers.GenerationMixin._get_logits_processor = ( - use_core_manipulations.get_logits_processor - ) - - if use_core_manipulations.sample: - use_core_manipulations.old_sample = transformers.GenerationMixin.sample - transformers.GenerationMixin.sample = use_core_manipulations.sample - - if use_core_manipulations.get_stopping_criteria: - use_core_manipulations.old_get_stopping_criteria = ( - transformers.GenerationMixin._get_stopping_criteria - ) - transformers.GenerationMixin._get_stopping_criteria = ( - use_core_manipulations.get_stopping_criteria - ) - return self - - def __exit__(self, exc_type, exc_value, exc_traceback): - if use_core_manipulations.old_get_logits_processor: - transformers.GenerationMixin._get_logits_processor = ( - use_core_manipulations.old_get_logits_processor - ) - else: - assert ( - not use_core_manipulations.get_logits_processor - ), "Patch leak: THE MONKEYS HAVE ESCAPED" - - if use_core_manipulations.old_sample: - transformers.GenerationMixin.sample = use_core_manipulations.old_sample - else: - assert ( - not use_core_manipulations.sample - ), "Patch leak: THE MONKEYS HAVE ESCAPED" - - if use_core_manipulations.old_get_stopping_criteria: - transformers.GenerationMixin._get_stopping_criteria = ( - use_core_manipulations.old_get_stopping_criteria - ) - else: - assert ( - not use_core_manipulations.get_stopping_criteria - ), "Patch leak: THE MONKEYS HAVE ESCAPED" - - -def patch_transformers_download(): - def http_get( - url: str, - temp_file, - proxies=None, - resume_size=0, - headers=None, - file_name=None, - 
): - """ - Download remote file. Do not gobble up errors. - """ - headers = copy.deepcopy(headers) - if resume_size > 0: - headers["Range"] = f"bytes={resume_size}-" - r = requests.get(url, stream=True, proxies=proxies, headers=headers) - transformers.utils.hub._raise_for_status(r) - content_length = r.headers.get("Content-Length") - total = ( - resume_size + int(content_length) if content_length is not None else None - ) - - # `tqdm` behavior is determined by `utils.logging.is_progress_bar_enabled()` - # and can be set using `utils.logging.enable/disable_progress_bar()` - if url[-11:] != "config.json": - progress = tqdm.tqdm( - unit="B", - unit_scale=True, - unit_divisor=1024, - total=total, - initial=resume_size, - desc=f"Downloading {file_name}" - if file_name is not None - else "Downloading", - file=utils.UIProgressBarFile(), - ) - utils.koboldai_vars.status_message = "Download Model" - utils.koboldai_vars.total_download_chunks = total - - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - if url[-11:] != "config.json": - progress.update(len(chunk)) - utils.koboldai_vars.downloaded_chunks += len(chunk) - temp_file.write(chunk) - - if url[-11:] != "config.json": - progress.close() - - utils.koboldai_vars.status_message = "" - - transformers.utils.hub.http_get = http_get - - -def patch_transformers_loader() -> None: - """ - Patch the Transformers loader to use aria2 and our shard tracking. - Universal for TPU/MTJ and Torch. - """ - old_from_pretrained = PreTrainedModel.from_pretrained.__func__ - - @classmethod - def new_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - utils.koboldai_vars.fp32_model = False - utils.num_shards = None - utils.current_shard = 0 - utils.from_pretrained_model_name = pretrained_model_name_or_path - utils.from_pretrained_index_filename = None - utils.from_pretrained_kwargs = kwargs - utils.bar = None - if not utils.args.no_aria2: - utils.aria2_hook(pretrained_model_name_or_path, **kwargs) - return old_from_pretrained( - cls, pretrained_model_name_or_path, *model_args, **kwargs - ) - - if not hasattr(PreTrainedModel, "_kai_patched"): - PreTrainedModel.from_pretrained = new_from_pretrained - PreTrainedModel._kai_patched = True - - if hasattr(modeling_utils, "get_checkpoint_shard_files"): - old_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files - - def new_get_checkpoint_shard_files( - pretrained_model_name_or_path, index_filename, *args, **kwargs - ): - utils.num_shards = utils.get_num_shards(index_filename) - utils.from_pretrained_index_filename = index_filename - return old_get_checkpoint_shard_files( - pretrained_model_name_or_path, index_filename, *args, **kwargs - ) - - modeling_utils.get_checkpoint_shard_files = new_get_checkpoint_shard_files - - -def patch_transformers_generation() -> None: - # Not sure why this global is needed... 
- global transformers - - # Patch transformers to use our custom logit warpers -- Only HFTorchInferenceModel uses this - def dynamic_processor_wrap(cls, field_name, var_name, cond=None): - old_call = cls.__call__ - - def new_call(self, *args, **kwargs): - if not isinstance(field_name, str) and isinstance(field_name, Iterable): - conds = [] - for f, v in zip(field_name, var_name): - conds.append(getattr(utils.koboldai_vars, v)) - setattr(self, f, conds[-1]) - else: - conds = getattr(utils.koboldai_vars, var_name) - setattr(self, field_name, conds) - assert len(args) == 2 - if cond is None or cond(conds): - return old_call(self, *args, **kwargs) - return args[1] - - cls.__call__ = new_call - - # Allow bad words filter to ban <|endoftext|> token - import transformers.generation.logits_process - - def new_init(self, bad_words_ids: List[List[int]], eos_token_id: int): - return new_init.old_init(self, bad_words_ids, -1) - - new_init.old_init = ( - transformers.generation.logits_process.NoBadWordsLogitsProcessor.__init__ - ) - transformers.generation.logits_process.NoBadWordsLogitsProcessor.__init__ = new_init - - -def patch_transformers() -> None: - patch_transformers_download() - patch_transformers_loader() - - # Doesn't do anything for TPU - patch_transformers_generation() - - -class GenerationResult: - """A container for easily accessing different forms of model outputs. Returned by most generate functions.""" - - def __init__( - self, - model: InferenceModel, - out_batches: list, - prompt: list, - # Controls if generate() does it's looping thing. This should only be - # done for HF models that use that StoppingCondition - is_whole_generation: bool, - # Controls if we should trim output by prompt length - output_includes_prompt: bool = False, - # Lazy filter to cut off extra lines where we can't manipulate - # probabilities - single_line: bool = False, - ): - # Shave prompt off of encoded response when needed (HF). Decoded does - # not return prompt. - if output_includes_prompt: - self.encoded = out_batches[:, len(prompt) :] - else: - self.encoded = out_batches - - self.prompt = prompt - self.is_whole_generation = is_whole_generation - - self.decoded = [ - utils.decodenewlines(model.tokenizer.decode(enc)) for enc in self.encoded - ] - - if single_line: - self.decoded = [x.split("\n", 1)[0] for x in self.decoded] - self.encoded = np.array(model.tokenizer(self.decoded).input_ids) - - -@dataclass -class ModelCapabilities: - embedding_manipulation: bool = False - post_token_hooks: bool = False - stopper_hooks: bool = False - # TODO: Support non-live probabilities from APIs - post_token_probs: bool = False - - -class InferenceModel: - """Root class for all models.""" - - def __init__(self) -> None: - self.gen_state = {} - self.post_token_hooks = [] - self.stopper_hooks = [] - self.tokenizer = None - self.capabilties = ModelCapabilities() - - def load(self, save_model: bool = False, initial_load: bool = False) -> None: - """User-facing load function. Do not override this; try `_load()` instead.""" - - self._load(save_model=save_model, initial_load=initial_load) - self._post_load() - - global HACK_currentmodel - HACK_currentmodel = self - - print(self.raw_generate("Hi guys,", 20).__dict__) - - def _post_load(self) -> None: - """Post load hook. Called after `_load()`.""" - pass - - def _load(self, save_model: bool, initial_load: bool) -> None: - """Main load method. 
All logic related to loading the model onto the - selected device(s) and preparing it for inference should be implemented here.""" - raise NotImplementedError - - def _get_tokenizer(self, location: str) -> AutoTokenizer: - """Returns the appropriate tokenizer for the location. Should be run once and the result stored in `tokenizer`. - - Args: - location (str): Either a local model directory path or a HuggingFace model ID. - - Returns: - AutoTokenizer: Tokenizer deemed fit for the location string. May be a fallback tokenizer. - """ - if utils.koboldai_vars.model_type == "xglm": - # Default to newline mode if using XGLM - utils.koboldai_vars.newlinemode = "s" - elif utils.koboldai_vars.model_type in ["opt", "bloom"]: - # Handle but don't convert newlines if using Fairseq models that have newlines trained in them - utils.koboldai_vars.newlinemode = "ns" - - std_kwargs = {"revision": utils.koboldai_vars.revision, "cache_dir": "cache"} - - suppliers = [ - # Fast tokenizer disabled by default as per HF docs: - # > Note: Make sure to pass use_fast=False when loading - # OPT’s tokenizer with AutoTokenizer to get the correct - # tokenizer. - lambda: AutoTokenizer.from_pretrained( - location, use_fast=False, **std_kwargs - ), - lambda: AutoTokenizer.from_pretrained(location, **std_kwargs), - # Fallback to GPT2Tokenizer - lambda: GPT2Tokenizer.from_pretrained(location, **std_kwargs), - lambda: GPT2Tokenizer.from_pretrained("gpt2", **std_kwargs), - ] - - for i, try_get_tokenizer in enumerate(suppliers): - try: - return try_get_tokenizer() - except: - # If we error on each attempt, raise the last one - if i == len(suppliers) - 1: - raise - - def core_generate( - self, - text: list, - found_entries: set, - ): - """Generate story text. Heavily tied to story-specific parameters; if - you are making a new generation-based feature, consider `raw_generate()`. 
- - Args: - text (list): Encoded input tokens - found_entries (set): Entries found for Dynamic WI - - Raises: - RuntimeError: if inconsistencies are detected with the internal state and Lua state -- sanity check - RuntimeError: if inconsistencies are detected with the internal state and core stopper -- sanity check - """ - - start_time = time.time() - gen_in = torch.tensor(text, dtype=torch.long)[None] - logger.debug( - "core_generate: torch.tensor time {}s".format(time.time() - start_time) - ) - - start_time = time.time() - if utils.koboldai_vars.is_model_torch(): - # Torch stuff - if utils.koboldai_vars.full_determinism: - torch.manual_seed(utils.koboldai_vars.seed) - - if utils.koboldai_vars.sp is not None: - assert self.capabilties.embedding_manipulation - soft_tokens = torch.arange( - self.model.config.vocab_size, - self.model.config.vocab_size + utils.koboldai_vars.sp.shape[0], - ) - gen_in = torch.cat((soft_tokens[None], gen_in), dim=-1) - elif utils.koboldai_vars.use_colab_tpu: - if utils.koboldai_vars.full_determinism: - tpu_mtj_backend.set_rng_seed(utils.koboldai_vars.seed) - - logger.debug( - "core_generate: Model Setup (SP, etc) time {}s".format( - time.time() - start_time - ) - ) - - if ( - gen_in.shape[-1] + utils.koboldai_vars.genamt - > utils.koboldai_vars.max_length - ): - logger.error("gen_in.shape[-1]: {}".format(gen_in.shape[-1])) - logger.error( - "utils.koboldai_vars.genamt: {}".format(utils.koboldai_vars.genamt) - ) - logger.error( - "utils.koboldai_vars.max_length: {}".format( - utils.koboldai_vars.max_length - ) - ) - assert ( - gen_in.shape[-1] + utils.koboldai_vars.genamt - <= utils.koboldai_vars.max_length - ) - - start_time = time.time() - gen_in = gen_in.to(utils.get_auxilary_device()) - - logger.debug( - "core_generate: gen_in to device time {}s".format(time.time() - start_time) - ) - start_time = time.time() - - found_entries = found_entries or set() - - self.gen_state["wi_scanner_excluded_keys"] = found_entries - - utils.koboldai_vars._prompt = utils.koboldai_vars.prompt - - with torch.no_grad(): - already_generated = 0 - numseqs = utils.koboldai_vars.numseqs - total_gens = None - - for i in range( - utils.koboldai_vars.numseqs if utils.koboldai_vars.alt_multi_gen else 1 - ): - while True: - # The reason this is a loop is due to how Dynamic WI works. We - # cannot simply add the WI to the context mid-generation, so we - # stop early, and then insert WI, then continue generating. That - # stopping and continuing is this loop. - - start_time = time.time() - result = self.raw_generate( - gen_in[0], - max_new=utils.koboldai_vars.genamt, - do_streaming=utils.koboldai_vars.output_streaming, - do_dynamic_wi=utils.koboldai_vars.dynamicscan, - batch_count=numseqs - if not utils.koboldai_vars.alt_multi_gen - else 1, - # Real max length is handled by CoreStopper. - bypass_hf_maxlength=utils.koboldai_vars.dynamicscan, - is_core=True, - ) - logger.debug( - "core_generate: run raw_generate pass {} {}s".format( - already_generated, time.time() - start_time - ) - ) - - genout = result.encoded - - already_generated += len(genout[0]) - - try: - # Parentheses matter here: the conditional must select the - # multiplier; without them the `if`/`else` absorbs the whole - # comparison and the assert degrades to a constant truthy `1` - # whenever alt_multi_gen is off. - assert already_generated <= utils.koboldai_vars.genamt * ( - utils.koboldai_vars.numseqs - if utils.koboldai_vars.alt_multi_gen - else 1 - ) - except AssertionError: - print("AlreadyGenerated", already_generated) - print("genamt", utils.koboldai_vars.genamt) - raise - - if result.is_whole_generation: - break - - # Generation stopped; why? 
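# (In short: of the three ways out of this pass, only one continues the
#  while-loop -- `regeneration_required` set while `halt` is clear, meaning
#  Dynamic WI interrupted generation to splice new entries into the
#  context. `halt`, or neither flag set, breaks out below.)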
- # If we have been told to halt, we have reached our target token - # amount (controlled by halt), or Dynamic WI has not told us to - # stop temporarily to insert WI, we can assume that we are done - # generating. We shall break. - if ( - self.gen_state["halt"] - or not self.gen_state["regeneration_required"] - ): - break - - # Now we are doing stuff for Dynamic WI. - assert genout.ndim >= 2 - assert genout.shape[0] == utils.koboldai_vars.numseqs - - if ( - utils.koboldai_vars.lua_koboldbridge.generated_cols - and utils.koboldai_vars.generated_tkns - != utils.koboldai_vars.lua_koboldbridge.generated_cols - ): - raise RuntimeError( - f"Inconsistency detected between KoboldAI Python and Lua backends ({utils.koboldai_vars.generated_tkns} != {utils.koboldai_vars.lua_koboldbridge.generated_cols})" - ) - - if already_generated != utils.koboldai_vars.generated_tkns: - print("already_generated: {}".format(already_generated)) - print( - "generated_tkns: {}".format( - utils.koboldai_vars.generated_tkns - ) - ) - raise RuntimeError("WI scanning error") - - for r in range(utils.koboldai_vars.numseqs): - for c in range(already_generated): - assert ( - utils.koboldai_vars.lua_koboldbridge.generated[r + 1][ - c + 1 - ] - is not None - ) - genout[r][ - genout.shape[-1] - already_generated + c - ] = utils.koboldai_vars.lua_koboldbridge.generated[r + 1][ - c + 1 - ] - - encoded = [] - - for i in range(utils.koboldai_vars.numseqs): - txt = utils.decodenewlines( - self.tokenizer.decode(genout[i, -already_generated:]) - ) - # winfo, mem, anotetxt, _found_entries = calcsubmitbudgetheader(txt, force_use_txt=True, actions=utils.koboldai_vars.actions) - # txt, _, _ = calcsubmitbudget(len(utils.koboldai_vars.actions), winfo, mem, anotetxt, utils.koboldai_vars.actions, submission=txt) - txt, _, _, _found_entries = utils.koboldai_vars.calc_ai_text( - submitted_text=txt, send_context=False - ) - found_entries[i].update(_found_entries) - encoded.append( - torch.tensor(txt, dtype=torch.long, device=genout.device) - ) - - max_length = len(max(encoded, key=len)) - encoded = torch.stack( - tuple( - torch.nn.functional.pad( - e, - (max_length - len(e), 0), - value=self.model.config.pad_token_id - or self.model.config.eos_token_id, - ) - for e in encoded - ) - ) - genout = torch.cat( - ( - encoded, - genout[..., -already_generated:], - ), - dim=-1, - ) - - if utils.koboldai_vars.sp is not None: - soft_tokens = torch.arange( - self.model.config.vocab_size, - self.model.config.vocab_size - + utils.koboldai_vars.sp.shape[0], - device=genout.device, - ) - genout = torch.cat( - (soft_tokens.tile(utils.koboldai_vars.numseqs, 1), genout), - dim=-1, - ) - - assert ( - genout.shape[-1] - + utils.koboldai_vars.genamt - - already_generated - <= utils.koboldai_vars.max_length - ) - gen_in = genout - numseqs = 1 - if total_gens is None: - total_gens = genout - else: - total_gens = torch.cat((total_gens, genout)) - - return total_gens, already_generated - - def _raw_generate( - self, - prompt_tokens: Union[List[int], torch.Tensor], - max_new: int, - gen_settings: GenerationSettings, - single_line: bool = False, - batch_count: int = 1, - ) -> GenerationResult: - """Lowest level model-agnostic generation function. To be overridden by model implementation. 
- - Args: - prompt_tokens (Union[List[int], torch.Tensor]): Prompt as encoded token IDs - max_new (int): Maximum amount of new tokens to generate - gen_settings (GenerationSettings): State to pass in single-generation setting overrides - single_line (bool, optional): Generate one line only. Defaults to False. - batch_count (int, optional): How big of a batch to generate. Defaults to 1. - - Returns: - GenerationResult: The model's output - """ - raise NotImplementedError - - def raw_generate( - self, - # prompt is either a string (text) or a list (token ids) - prompt: Union[str, list, np.ndarray], - max_new: int, - do_streaming: bool = False, - do_dynamic_wi: bool = False, - batch_count: int = 1, - bypass_hf_maxlength: bool = False, - generation_settings: Optional[dict] = None, - is_core: bool = False, - single_line: bool = False, - found_entries: set = (), - ) -> GenerationResult: - """A wrapper around `_raw_generate()` that handles gen_state and other stuff. Use this to generate text outside of the story. - - Args: - prompt (Union[str, list, np.ndarray]): The prompt as a string or encoded token IDs - max_new (int): Maximum amount of new tokens to generate - do_streaming (bool, optional): Whether to stream tokens to the user or not. Defaults to False. - do_dynamic_wi (bool, optional): Whether to use Dynamic WI context injections. Defaults to False. - batch_count (int, optional): How big of a batch to generate. Defaults to 1. - bypass_hf_maxlength (bool, optional): Whether to ignore model-provided max length limits. Defaults to False. - generation_settings (dict, optional): Single-generation setting overrides, passed through to `GenerationSettings`. Defaults to None. - is_core (bool, optional): Whether this generation is a core story generation. Defaults to False. - single_line (bool, optional): Generate one line only. Defaults to False. - found_entries (set, optional): Entries found for Dynamic WI. Defaults to (). - - Raises: - ValueError: If prompt type is weird - NotImplementedError: If model is ReadOnly - - Returns: - GenerationResult: The model's output - """ - # TODO: Support singleline outside of torch - - self.gen_state["do_streaming"] = do_streaming - self.gen_state["do_dynamic_wi"] = do_dynamic_wi - - # Dynamic WI depends on this!!! This is a main gen call. - self.gen_state["stop_at_genamt"] = do_dynamic_wi - - # Makes stopping criteria hook happy - self.gen_state["wi_scanner_excluded_keys"] = self.gen_state.get( - "wi_scanner_excluded_keys", set() - ) - - utils.koboldai_vars.inference_config.do_core = is_core - # `generation_settings` is a dict of keyword overrides, so it must be - # unpacked with `**`; `*` would pass the dict's keys positionally. - gen_settings = GenerationSettings(**(generation_settings or {})) - - if isinstance(prompt, torch.Tensor): - prompt_tokens = prompt.cpu().numpy() - elif isinstance(prompt, list): - prompt_tokens = np.array(prompt) - elif isinstance(prompt, str): - prompt_tokens = np.array(self.tokenizer.encode(prompt)) - else: - raise ValueError(f"Prompt is {type(prompt)}. 
Not a fan!") - - assert isinstance(prompt_tokens, np.ndarray) - assert len(prompt_tokens.shape) == 1 - - if utils.koboldai_vars.model == "ReadOnly": - raise NotImplementedError("No loaded model") - - time_start = time.time() - - with use_core_manipulations(): - result = self._raw_generate( - prompt_tokens=prompt_tokens, - max_new=max_new, - batch_count=batch_count, - gen_settings=gen_settings, - single_line=single_line, - ) - - time_end = round(time.time() - time_start, 2) - tokens_per_second = round(len(result.encoded[0]) / time_end, 2) - - if not utils.koboldai_vars.quiet: - logger.info( - f"Generated {len(result.encoded[0])} tokens in {time_end} seconds, for an average rate of {tokens_per_second} tokens per second." - ) - - return result - - def generate( - self, - prompt_tokens: Union[List[int], torch.Tensor], - max_new_tokens: int, - do_streaming: bool = False, - do_dynamic_wi: bool = False, - single_line: bool = False, - batch_count: int = 1, - ) -> torch.Tensor: - raise NotImplementedError - - def _post_token_gen(self, input_ids: torch.LongTensor) -> None: - for hook in self.post_token_hooks: - hook(self, input_ids) - - -class HFInferenceModel(InferenceModel): - def __init__(self) -> None: - super().__init__() - self.model_config = None - - def get_local_model_path( - self, legacy: bool = False, ignore_existance: bool = False - ) -> Optional[str]: - """ - Returns a string of the model's path locally, or None if it is not downloaded. - If ignore_existance is true, it will always return a path. - """ - - basename = utils.koboldai_vars.model.replace("/", "_") - if legacy: - ret = basename - else: - ret = os.path.join("models", basename) - - if os.path.isdir(ret) or ignore_existance: - return ret - return None - - def init_model_config(self) -> None: - # Get the model_type from the config or assume a model type if it isn't present - try: - self.model_config = AutoConfig.from_pretrained( - self.get_local_model_path() or utils.koboldai_vars.model, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - ) - utils.koboldai_vars.model_type = self.model_config.model_type - except ValueError: - utils.koboldai_vars.model_type = { - "NeoCustom": "gpt_neo", - "GPT2Custom": "gpt2", - }.get(utils.koboldai_vars.model) - - if not utils.koboldai_vars.model_type: - logger.warning( - "No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)" - ) - utils.koboldai_vars.model_type = "gpt_neo" - - -class HFMTJInferenceModel(HFInferenceModel): - def __init__( - self, - model_name: str, - ) -> None: - super().__init__() - - self.model_name = model_name - - self.model = None - self.tokenizer = None - self.model_config = None - self.capabilties = ModelCapabilities( - embedding_manipulation=False, - post_token_hooks=False, - stopper_hooks=False, - post_token_probs=False, - ) - - def setup_mtj(self) -> None: - def mtj_warper_callback(scores) -> "np.array": - scores_shape = scores.shape - scores_list = scores.tolist() - utils.koboldai_vars.lua_koboldbridge.logits = ( - utils.koboldai_vars.lua_state.table() - ) - for r, row in enumerate(scores_list): - utils.koboldai_vars.lua_koboldbridge.logits[ - r + 1 - ] = utils.koboldai_vars.lua_state.table(*row) - utils.koboldai_vars.lua_koboldbridge.vocab_size = scores_shape[-1] - - utils.koboldai_vars.lua_koboldbridge.execute_genmod() - - scores = np.array( - tuple( - tuple(row.values()) - for row in utils.koboldai_vars.lua_koboldbridge.logits.values() - ), - dtype=scores.dtype, - ) - assert scores.shape == 
scores_shape - - return scores - - def mtj_stopping_callback( - generated, n_generated, excluded_world_info - ) -> Tuple[List[set], bool, bool]: - utils.koboldai_vars.generated_tkns += 1 - - assert len(excluded_world_info) == len(generated) - regeneration_required = ( - utils.koboldai_vars.lua_koboldbridge.regeneration_required - ) - halt = ( - utils.koboldai_vars.abort - or not utils.koboldai_vars.lua_koboldbridge.generating - or utils.koboldai_vars.generated_tkns >= utils.koboldai_vars.genamt - ) - utils.koboldai_vars.lua_koboldbridge.regeneration_required = False - - global past - - for i in range(utils.koboldai_vars.numseqs): - utils.koboldai_vars.lua_koboldbridge.generated[i + 1][ - utils.koboldai_vars.generated_tkns - ] = int( - generated[i, tpu_mtj_backend.params["seq"] + n_generated - 1].item() - ) - - if not utils.koboldai_vars.dynamicscan or halt: - return excluded_world_info, regeneration_required, halt - - for i, t in enumerate(generated): - decoded = utils.decodenewlines( - self.tokenizer.decode(past[i]) - ) + utils.decodenewlines( - self.tokenizer.decode( - t[ - tpu_mtj_backend.params["seq"] : tpu_mtj_backend.params[ - "seq" - ] - + n_generated - ] - ) - ) - # _, found = checkworldinfo(decoded, force_use_txt=True, actions=koboldai_vars.actions) - _, _, _, found = utils.koboldai_vars.calc_ai_text( - submitted_text=decoded - ) - found -= excluded_world_info[i] - if len(found) != 0: - regeneration_required = True - break - return excluded_world_info, regeneration_required, halt - - def mtj_compiling_callback() -> None: - print(colors.GREEN + "TPU backend compilation triggered" + colors.END) - utils.koboldai_vars.compiling = True - - def mtj_stopped_compiling_callback() -> None: - print(colors.GREEN + "TPU backend compilation stopped" + colors.END) - utils.koboldai_vars.compiling = False - - def mtj_settings_callback() -> dict: - sampler_order = utils.koboldai_vars.sampler_order[:] - if ( - len(sampler_order) < 7 - ): # Add repetition penalty at beginning if it's not present - sampler_order = [6] + sampler_order - return { - "sampler_order": utils.koboldai_vars.sampler_order, - "top_p": float(utils.koboldai_vars.top_p), - "temp": float(utils.koboldai_vars.temp), - "top_k": int(utils.koboldai_vars.top_k), - "tfs": float(utils.koboldai_vars.tfs), - "typical": float(utils.koboldai_vars.typical), - "top_a": float(utils.koboldai_vars.top_a), - "repetition_penalty": float(utils.koboldai_vars.rep_pen), - "rpslope": float(utils.koboldai_vars.rep_pen_slope), - "rprange": int(utils.koboldai_vars.rep_pen_range), - } - - tpu_mtj_backend.socketio = utils.socketio - - if utils.koboldai_vars.model == "TPUMeshTransformerGPTNeoX": - utils.koboldai_vars.badwordsids = utils.koboldai_vars.badwordsids_neox - - print( - "{0}Initializing Mesh Transformer JAX, please wait...{1}".format( - colors.PURPLE, colors.END - ) - ) - if utils.koboldai_vars.model in ( - "TPUMeshTransformerGPTJ", - "TPUMeshTransformerGPTNeoX", - ) and ( - not utils.koboldai_vars.custmodpth - or not os.path.isdir(utils.koboldai_vars.custmodpth) - ): - raise FileNotFoundError( - f"The specified model path {repr(utils.koboldai_vars.custmodpth)} is not the path to a valid folder" - ) - if utils.koboldai_vars.model == "TPUMeshTransformerGPTNeoX": - tpu_mtj_backend.pad_token_id = 2 - - tpu_mtj_backend.koboldai_vars = utils.koboldai_vars - tpu_mtj_backend.warper_callback = mtj_warper_callback - tpu_mtj_backend.stopping_callback = mtj_stopping_callback - tpu_mtj_backend.compiling_callback = mtj_compiling_callback - 
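# (Careful: `mtj_settings_callback` above builds a patched local
#  `sampler_order` -- prepending the repetition-penalty sampler, id 6, when
#  it is missing -- but then returns `utils.koboldai_vars.sampler_order`
#  anyway, so the prepend never takes effect. The dict entry was presumably
#  meant to read:
#
#      "sampler_order": sampler_order,
#
#  i.e. the patched local rather than the raw setting.)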
tpu_mtj_backend.stopped_compiling_callback = mtj_stopped_compiling_callback - tpu_mtj_backend.settings_callback = mtj_settings_callback - - def _load(self, save_model: bool, initial_load: bool) -> None: - self.setup_mtj() - self.init_model_config() - utils.koboldai_vars.allowsp = True - - tpu_mtj_backend.load_model( - utils.koboldai_vars.model, - hf_checkpoint=utils.koboldai_vars.model - not in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX") - and utils.koboldai_vars.use_colab_tpu, - socketio_queue=koboldai_settings.queue, - initial_load=initial_load, - logger=logger, - **self.model_config.to_dict() - ) - - utils.koboldai_vars.modeldim = int( - tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"]) - ) - - self.tokenizer = tpu_mtj_backend.tokenizer - if ( - utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default - and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj") - ): - utils.koboldai_vars.badwordsids = [ - [v] - for k, v in self.tokenizer.get_vocab().items() - if any(c in str(k) for c in "<>[]") - if utils.koboldai_vars.newlinemode != "s" or str(k) != "" - ] - - def get_soft_tokens(self) -> np.array: - soft_tokens = None - - if utils.koboldai_vars.sp is None: - tensor = np.zeros( - ( - 1, - tpu_mtj_backend.params.get( - "d_embed", tpu_mtj_backend.params["d_model"] - ), - ), - dtype=np.float32, - ) - rows = tensor.shape[0] - padding_amount = ( - tpu_mtj_backend.params["seq"] - - ( - tpu_mtj_backend.params["seq"] - % -tpu_mtj_backend.params["cores_per_replica"] - ) - - rows - ) - tensor = np.pad(tensor, ((0, padding_amount), (0, 0))) - tensor = tensor.reshape( - tpu_mtj_backend.params["cores_per_replica"], - -1, - tpu_mtj_backend.params.get( - "d_embed", tpu_mtj_backend.params["d_model"] - ), - ) - utils.koboldai_vars.sp = tpu_mtj_backend.shard_xmap(tensor) - - soft_tokens = np.arange( - tpu_mtj_backend.params["n_vocab"] - + tpu_mtj_backend.params["n_vocab_padding"], - tpu_mtj_backend.params["n_vocab"] - + tpu_mtj_backend.params["n_vocab_padding"] - + utils.koboldai_vars.sp_length, - dtype=np.uint32, - ) - return soft_tokens - - def _raw_generate( - self, - prompt_tokens: Union[List[int], torch.Tensor], - max_new: int, - gen_settings: GenerationSettings, - single_line: bool = False, - batch_count: int = 1, - ) -> GenerationResult: - soft_tokens = self.get_soft_tokens() - - genout = tpool.execute( - tpu_mtj_backend.infer_static, - np.uint32(prompt_tokens), - gen_len=max_new, - temp=gen_settings.temp, - top_p=gen_settings.top_p, - top_k=gen_settings.top_k, - tfs=gen_settings.tfs, - typical=gen_settings.typical, - top_a=gen_settings.top_a, - numseqs=batch_count, - repetition_penalty=gen_settings.rep_pen, - rpslope=gen_settings.rep_pen_slope, - rprange=gen_settings.rep_pen_range, - soft_embeddings=utils.koboldai_vars.sp, - soft_tokens=soft_tokens, - sampler_order=gen_settings.sampler_order, - ) - genout = np.array(genout) - - return GenerationResult( - self, - out_batches=genout, - prompt=prompt_tokens, - is_whole_generation=True, - single_line=single_line, - ) - - -class HFTorchInferenceModel(HFInferenceModel): - def __init__( - self, - model_name: str, - lazy_load: bool, - low_mem: bool, - ) -> None: - super().__init__() - - self.model_name = model_name - self.lazy_load = lazy_load - self.low_mem = low_mem - - self.post_token_hooks = [ - Stoppers.core_stopper, - PostTokenHooks.stream_tokens, - Stoppers.dynamic_wi_scanner, - Stoppers.chat_mode_stopper, - ] - - self.model = None - self.tokenizer = None - self.capabilties = 
ModelCapabilities( - embedding_manipulation=True, - post_token_hooks=True, - stopper_hooks=True, - post_token_probs=True, - ) - self._old_stopping_criteria = None - - def _apply_warpers( - self, scores: torch.Tensor, input_ids: torch.Tensor - ) -> torch.Tensor: - warpers.update_settings() - for sid in utils.koboldai_vars.sampler_order: - warper = Warper.from_id(sid) - if warper == warpers.RepetitionPenalty: - # Rep pen needs more data than other samplers - scores = warper.torch(scores, input_ids=input_ids) - else: - scores = warper.torch(scores) - return scores - - def _post_load(self) -> None: - # Patch stopping_criteria - - class PTHStopper(StoppingCriteria): - def __call__( - hf_self, - input_ids: torch.LongTensor, - scores: torch.FloatTensor, - ) -> None: - self._post_token_gen(input_ids) - - for stopper in self.stopper_hooks: - do_stop = stopper(input_ids) - if do_stop: - return True - return False - - old_gsc = transformers.GenerationMixin._get_stopping_criteria - - def _get_stopping_criteria( - hf_self, - *args, - **kwargs, - ): - stopping_criteria = old_gsc(hf_self, *args, **kwargs) - stopping_criteria.insert(0, PTHStopper()) - return stopping_criteria - - use_core_manipulations.get_stopping_criteria = _get_stopping_criteria - - # Patch logitswarpers - - class PhraseBiasLogitsProcessor(LogitsProcessor): - def __init__(self): - pass - - def _find_intersection(self, big: List, small: List) -> int: - """Find the maximum overlap between the beginning of small and the end of big. - Return the index of the token in small following the overlap, or 0. - - big: The tokens in the context (as a tensor) - small: The tokens in the phrase to bias (as a list) - - Both big and small are in "oldest to newest" order. - """ - # There are asymptotically more efficient methods for determining the overlap, - # but typically there will be few (0-1) instances of small[0] in the last len(small) - # elements of big, plus small will typically be fairly short. So this naive - # approach is acceptable despite O(N^2) worst case performance. - - num_small = len(small) - # The small list can only ever match against at most num_small tokens of big, - # so create a slice. Typically, this slice will be as long as small, but it - # may be shorter if the story has just started. - # We need to convert the big slice to list, since natively big is a tensor - # and tensor and list don't ever compare equal. It's better to convert here - # and then use native equality tests than to iterate repeatedly later. - big_slice = list(big[-num_small:]) - - # It's possible that the start token appears multiple times in small - # For example, consider the phrase: - # [ fair is foul, and foul is fair, hover through the fog and filthy air] - # If we merely look for the first instance of [ fair], then we would - # generate the following output: - # " fair is foul, and foul is fair is foul, and foul is fair..." - start = small[0] - for i, t in enumerate(big_slice): - # Strictly unnecessary, but it's marginally faster to test the first - # token before creating slices to test for a full match. - if t == start: - remaining = len(big_slice) - i - if big_slice[i:] == small[:remaining]: - # We found a match. If the small phrase has any remaining tokens - # then return the index of the next token. - if remaining < num_small: - return remaining - # In this case, the entire small phrase matched, so start over. - return 0 - - # There were no matches, so just begin at the beginning. 
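# (Worked example with made-up token ids: big = [10, 50, 60, 70] and
#  small = [60, 70, 80, 90] give big_slice = [10, 50, 60, 70]; the match
#  starts at i == 2, remaining == 4 - 2 == 2, and big_slice[2:] equals
#  small[:2], so the method returns 2 -- the index in `small` of the next
#  token to bias toward (80). With no overlap at all, we fall through to
#  the `return 0` below.)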
- return 0 - - def _allow_leftwards_tampering(self, phrase: str) -> bool: - """Determines if a phrase should be tampered with from the left in - the "soft" token encoding mode.""" - - if phrase[0] in [".", "?", "!", ";", ":", "\n"]: - return False - return True - - def _get_token_sequence(self, phrase: str) -> List[List]: - """Convert the phrase string into a list of encoded biases, each - one being a list of tokens. How this is done is determined by the - phrase's format: - - - If the phrase is surrounded by square brackets ([]), the tokens - will be the phrase split by commas (,). If a "token" isn't - actually a number, it will be skipped. NOTE: Tokens output by - this may not be in the model's vocabulary, and such tokens - should be ignored later in the pipeline. - - If the phrase is surrounded by curly brackets ({}), the phrase - will be directly encoded with no synonym biases and no fancy - tricks. - - Otherwise, the phrase will be encoded, with close deviations - being included as synonym biases. - """ - - # TODO: Cache these tokens, invalidate when model or bias is - # changed. - - # Handle direct token id input - if phrase.startswith("[") and phrase.endswith("]"): - no_brackets = phrase[1:-1] - ret = [] - for token_id in no_brackets.split(","): - try: - ret.append(int(token_id)) - except ValueError: - # Ignore non-numbers. Rascals! - pass - return [ret] - - # Handle direct phrases - if phrase.startswith("{") and phrase.endswith("}"): - no_brackets = phrase[1:-1] - return [HACK_currentmodel.tokenizer.encode(no_brackets)] - - # Handle untamperable phrases - if not self._allow_leftwards_tampering(phrase): - return [HACK_currentmodel.tokenizer.encode(phrase)] - - # Handle slight alterations to original phrase - phrase = phrase.strip(" ") - ret = [] - - for alt_phrase in [phrase, f" {phrase}"]: - ret.append(HACK_currentmodel.tokenizer.encode(alt_phrase)) - - return ret - - def _get_biased_tokens(self, input_ids: List) -> Dict: - # TODO: Different "bias slopes"? - - ret = {} - for phrase, _bias in utils.koboldai_vars.biases.items(): - bias_score, completion_threshold = _bias - token_seqs = self._get_token_sequence(phrase) - variant_deltas = {} - for token_seq in token_seqs: - bias_index = self._find_intersection(input_ids, token_seq) - - # Ensure completion after completion_threshold tokens - # Only provide a positive bias when the base bias score is positive. - if bias_score > 0 and bias_index + 1 > completion_threshold: - bias_score = 999 - - token_to_bias = token_seq[bias_index] - variant_deltas[token_to_bias] = bias_score - - # If multiple phrases bias the same token, add the modifiers - # together. 
This should NOT be applied to automatic variants - for token_to_bias, bias_score in variant_deltas.items(): - if token_to_bias in ret: - ret[token_to_bias] += bias_score - else: - ret[token_to_bias] = bias_score - return ret - - def __call__( - self, input_ids: torch.LongTensor, scores: torch.FloatTensor - ) -> torch.FloatTensor: - assert scores.ndim == 2 - assert input_ids.ndim == 2 - - scores_shape = scores.shape - - for batch in range(scores_shape[0]): - for token, bias in self._get_biased_tokens( - input_ids[batch] - ).items(): - scores[batch][token] += bias - - return scores - - class LuaLogitsProcessor(LogitsProcessor): - def __init__(self): - pass - - def __call__( - self, input_ids: torch.LongTensor, scores: torch.FloatTensor - ) -> torch.FloatTensor: - assert scores.ndim == 2 - assert input_ids.ndim == 2 - self.regeneration_required = False - self.halt = False - - if utils.koboldai_vars.standalone: - return scores - - scores_shape = scores.shape - scores_list = scores.tolist() - utils.koboldai_vars.lua_koboldbridge.logits = ( - utils.koboldai_vars.lua_state.table() - ) - for r, row in enumerate(scores_list): - utils.koboldai_vars.lua_koboldbridge.logits[ - r + 1 - ] = utils.koboldai_vars.lua_state.table(*row) - utils.koboldai_vars.lua_koboldbridge.vocab_size = scores_shape[-1] - - utils.koboldai_vars.lua_koboldbridge.execute_genmod() - - scores = torch.tensor( - tuple( - tuple(row.values()) - for row in utils.koboldai_vars.lua_koboldbridge.logits.values() - ), - device=scores.device, - dtype=scores.dtype, - ) - assert scores.shape == scores_shape - - return scores - - from torch.nn import functional as F - - def visualize_probabilities( - model: InferenceModel, - scores: torch.FloatTensor, - ) -> None: - assert scores.ndim == 2 - - if utils.koboldai_vars.numseqs > 1 or not utils.koboldai_vars.show_probs: - return - - if not utils.koboldai_vars.show_probs: - return scores - - option_offset = 0 - if ( - utils.koboldai_vars.actions.action_count + 1 - in utils.koboldai_vars.actions.actions - ): - for x in range( - len( - utils.koboldai_vars.actions.actions[ - utils.koboldai_vars.actions.action_count + 1 - ]["Options"] - ) - ): - option = utils.koboldai_vars.actions.actions[ - utils.koboldai_vars.actions.action_count + 1 - ]["Options"][x] - if ( - option["Pinned"] - or option["Previous Selection"] - or option["Edited"] - ): - option_offset = x + 1 - batch_offset = ( - int( - (utils.koboldai_vars.generated_tkns - 1) - / utils.koboldai_vars.genamt - ) - if utils.koboldai_vars.alt_multi_gen - else 0 - ) - for batch_index, batch in enumerate(scores): - probs = F.softmax(batch, dim=-1).cpu().numpy() - - token_prob_info = [] - for token_id, score in sorted( - enumerate(probs), key=lambda x: x[1], reverse=True - )[:8]: - token_prob_info.append( - { - "tokenId": token_id, - "decoded": utils.decodenewlines( - model.tokenizer.decode(token_id) - ), - "score": float(score), - } - ) - - if utils.koboldai_vars.numseqs == 1: - utils.koboldai_vars.actions.set_probabilities(token_prob_info) - else: - utils.koboldai_vars.actions.set_option_probabilities( - token_prob_info, batch_index + option_offset + batch_offset - ) - - return scores - - def new_get_logits_processor(*args, **kwargs) -> LogitsProcessorList: - processors = new_get_logits_processor.old_get_logits_processor( - *args, **kwargs - ) - # TODOB4MERGE: These two - # processors.insert(0, LuaLogitsProcessor()) - # processors.append(PhraseBiasLogitsProcessor()) - return processors - - use_core_manipulations.get_logits_processor = 
new_get_logits_processor - new_get_logits_processor.old_get_logits_processor = ( - transformers.GenerationMixin._get_logits_processor - ) - - class KoboldLogitsWarperList(LogitsProcessorList): - def __init__(self): - pass - - def __call__( - lw_self, - input_ids: torch.LongTensor, - scores: torch.FloatTensor, - *args, - **kwargs, - ): - # sampler_order = utils.koboldai_vars.sampler_order[:] - # if ( - # len(sampler_order) < 7 - # ): # Add repetition penalty at beginning if it's not present - # sampler_order = [6] + sampler_order - # for k in sampler_order: - # scores = self.__warper_list[k](input_ids, scores, *args, **kwargs) - scores = self._apply_warpers(scores=scores, input_ids=input_ids) - visualize_probabilities(HACK_currentmodel, scores) - return scores - - def new_get_logits_warper( - beams: int = 1, - ) -> LogitsProcessorList: - return KoboldLogitsWarperList() - - def new_sample(self, *args, **kwargs): - assert kwargs.pop("logits_warper", None) is not None - kwargs["logits_warper"] = new_get_logits_warper( - beams=1, - ) - if utils.koboldai_vars.newlinemode in ["s", "ns"]: - kwargs["eos_token_id"] = -1 - kwargs.setdefault("pad_token_id", 2) - return new_sample.old_sample(self, *args, **kwargs) - - new_sample.old_sample = transformers.GenerationMixin.sample - use_core_manipulations.sample = new_sample - - def _raw_generate( - self, - prompt_tokens: Union[List[int], torch.Tensor], - max_new: int, - gen_settings: GenerationSettings, - single_line: bool = False, - batch_count: int = 1, - ) -> GenerationResult: - if not isinstance(prompt_tokens, torch.Tensor): - gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] - else: - gen_in = prompt_tokens - - device = utils.get_auxilary_device() - gen_in = gen_in.to(device) - - additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else [] - - with torch.no_grad(): - start_time = time.time() - genout = self.model.generate( - gen_in, - do_sample=True, - max_length=min( - len(prompt_tokens) + max_new, utils.koboldai_vars.max_length - ), - repetition_penalty=1.0, - bad_words_ids=utils.koboldai_vars.badwordsids - + additional_bad_words_ids, - use_cache=True, - num_return_sequences=batch_count, - ) - logger.debug( - "torch_raw_generate: run generator {}s".format(time.time() - start_time) - ) - - return GenerationResult( - self, - out_batches=genout, - prompt=prompt_tokens, - is_whole_generation=False, - output_includes_prompt=True, - ) - - def _get_model(self, location: str, tf_kwargs: Dict): - try: - return AutoModelForCausalLM.from_pretrained( - location, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - **tf_kwargs, - ) - except Exception as e: - if "out of memory" in traceback.format_exc().lower(): - raise RuntimeError( - "One of your GPUs ran out of memory when KoboldAI tried to load your model." 
- ) - return GPTNeoForCausalLM.from_pretrained( - location, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - **tf_kwargs, - ) - - def get_hidden_size(self) -> int: - return self.model.get_input_embeddings().embedding_dim - - def _move_to_devices(self) -> None: - if not utils.koboldai_vars.breakmodel: - if utils.koboldai_vars.usegpu: - self.model = self.model.half().to(utils.koboldai_vars.gpu_device) - else: - self.model = self.model.to("cpu").float() - return - - for key, value in self.model.state_dict().items(): - target_dtype = ( - torch.float32 if breakmodel.primary_device == "cpu" else torch.float16 - ) - if value.dtype is not target_dtype: - accelerate.utils.set_module_tensor_to_device( - self.model, key, target_dtype - ) - - disk_blocks = breakmodel.disk_blocks - gpu_blocks = breakmodel.gpu_blocks - ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - device_map = {} - - for name in utils.layers_module_names: - layer = int(name.rsplit(".", 1)[1]) - device = ( - ("disk" if layer < disk_blocks else "cpu") - if layer < ram_blocks - else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) - ) - device_map[name] = device - - for name in utils.get_missing_module_names(self.model, list(device_map.keys())): - device_map[name] = breakmodel.primary_device - - breakmodel.dispatch_model_ex( - self.model, - device_map, - main_device=breakmodel.primary_device, - offload_buffers=True, - offload_dir="accelerate-disk-cache", - ) - - gc.collect() - return - - # Function to patch transformers to use our soft prompt - def patch_embedding(self) -> None: - if getattr(Embedding, "_koboldai_patch_causallm_model", None): - Embedding._koboldai_patch_causallm_model = self.model - return - - old_embedding_call = Embedding.__call__ - - kai_model = self - - def new_embedding_call(self, input_ids, *args, **kwargs): - # Don't touch embeddings for models other than the core inference model (that's us!) 
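# (How the soft-prompt swap below works, traced with a tiny made-up
#  example: soft tokens are assigned ids starting at vocab_size, so with
#  vocab_size = 100 and input_ids = [42, 100, 101], shifted_input_ids is
#  [-58, 0, 1]. Real ids are clamped back into range and embedded normally,
#  then torch.where() substitutes rows of the soft-prompt tensor wherever
#  shifted_input_ids >= 0: positions 1 and 2 take sp[0] and sp[1], while
#  position 0 keeps the ordinary embedding of token 42.)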
- if ( - Embedding._koboldai_patch_causallm_model.get_input_embeddings() - is not self - ): - return old_embedding_call(self, input_ids, *args, **kwargs) - - assert input_ids is not None - - if utils.koboldai_vars.sp is not None: - shifted_input_ids = input_ids - kai_model.model.config.vocab_size - - input_ids.clamp_(max=kai_model.model.config.vocab_size - 1) - inputs_embeds = old_embedding_call(self, input_ids, *args, **kwargs) - - if utils.koboldai_vars.sp is not None: - utils.koboldai_vars.sp = utils.koboldai_vars.sp.to( - inputs_embeds.dtype - ).to(inputs_embeds.device) - inputs_embeds = torch.where( - (shifted_input_ids >= 0)[..., None], - utils.koboldai_vars.sp[shifted_input_ids.clamp(min=0)], - inputs_embeds, - ) - - return inputs_embeds - - Embedding.__call__ = new_embedding_call - Embedding._koboldai_patch_causallm_model = self.model - - def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True): - if not self.lazy_load: - return - - if utils.args.breakmodel_disklayers is not None: - breakmodel.disk_blocks = utils.args.breakmodel_disklayers - - disk_blocks = breakmodel.disk_blocks - gpu_blocks = breakmodel.gpu_blocks - ram_blocks = ram_blocks = n_layers - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - - def lazy_load_callback( - model_dict: Dict[str, Union[torch_lazy_loader.LazyTensor, torch.Tensor]], - f, - **_, - ): - if lazy_load_callback.nested: - return - lazy_load_callback.nested = True - - device_map: Dict[str, Union[str, int]] = {} - - @functools.lru_cache(maxsize=None) - def get_original_key(key): - return max( - ( - original_key - for original_key in utils.module_names - if original_key.endswith(key) - ), - key=len, - ) - - for key, value in model_dict.items(): - original_key = get_original_key(key) - if isinstance(value, torch_lazy_loader.LazyTensor) and not any( - original_key.startswith(n) for n in utils.layers_module_names - ): - device_map[key] = ( - utils.koboldai_vars.gpu_device - if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu - else "cpu" - if not utils.koboldai_vars.hascuda - or not utils.koboldai_vars.breakmodel - else breakmodel.primary_device - ) - else: - layer = int( - max( - ( - n - for n in utils.layers_module_names - if original_key.startswith(n) - ), - key=len, - ).rsplit(".", 1)[1] - ) - device = ( - utils.koboldai_vars.gpu_device - if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu - else "disk" - if layer < disk_blocks and layer < ram_blocks - else "cpu" - if not utils.koboldai_vars.hascuda - or not utils.koboldai_vars.breakmodel - else "shared" - if layer < ram_blocks - else bisect.bisect_right( - cumulative_gpu_blocks, layer - ram_blocks - ) - ) - device_map[key] = device - - if utils.num_shards is None or utils.current_shard == 0: - utils.offload_index = {} - if os.path.isdir("accelerate-disk-cache"): - # Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder - # (the folder doesn't contain any subfolders so os.remove will do just fine) - for filename in os.listdir("accelerate-disk-cache"): - try: - os.remove(os.path.join("accelerate-disk-cache", filename)) - except OSError: - pass - os.makedirs("accelerate-disk-cache", exist_ok=True) - if utils.num_shards is not None: - num_tensors = len( - utils.get_sharded_checkpoint_num_tensors( - utils.from_pretrained_model_name, - utils.from_pretrained_index_filename, - **utils.from_pretrained_kwargs, - ) - ) - else: - num_tensors = 
len(device_map) - print(flush=True) - utils.koboldai_vars.status_message = "Loading model" - utils.koboldai_vars.total_layers = num_tensors - utils.koboldai_vars.loaded_layers = 0 - utils.bar = tqdm( - total=num_tensors, - desc="Loading model tensors", - file=utils.UIProgressBarFile(), - ) - - with zipfile.ZipFile(f, "r") as z: - try: - last_storage_key = None - zipfolder = os.path.basename(os.path.normpath(f)).split(".")[0] - f = None - current_offset = 0 - able_to_pin_layers = True - if utils.num_shards is not None: - utils.current_shard += 1 - for key in sorted( - device_map.keys(), - key=lambda k: (model_dict[k].key, model_dict[k].seek_offset), - ): - storage_key = model_dict[key].key - if ( - storage_key != last_storage_key - or model_dict[key].seek_offset < current_offset - ): - last_storage_key = storage_key - if isinstance(f, zipfile.ZipExtFile): - f.close() - try: - f = z.open(f"archive/data/{storage_key}") - except: - f = z.open(f"{zipfolder}/data/{storage_key}") - current_offset = 0 - if current_offset != model_dict[key].seek_offset: - f.read(model_dict[key].seek_offset - current_offset) - current_offset = model_dict[key].seek_offset - device = device_map[key] - size = functools.reduce( - lambda x, y: x * y, model_dict[key].shape, 1 - ) - dtype = model_dict[key].dtype - nbytes = ( - size - if dtype is torch.bool - else size - * ( - ( - torch.finfo - if dtype.is_floating_point - else torch.iinfo - )(dtype).bits - >> 3 - ) - ) - # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True) - model_dict[key] = model_dict[key].materialize( - f, map_location="cpu" - ) - if model_dict[key].dtype is torch.float32: - utils.koboldai_vars.fp32_model = True - if ( - convert_to_float16 - and breakmodel.primary_device != "cpu" - and utils.koboldai_vars.hascuda - and ( - utils.koboldai_vars.breakmodel - or utils.koboldai_vars.usegpu - ) - and model_dict[key].dtype is torch.float32 - ): - model_dict[key] = model_dict[key].to(torch.float16) - if breakmodel.primary_device == "cpu" or ( - not utils.koboldai_vars.usegpu - and not utils.koboldai_vars.breakmodel - and model_dict[key].dtype is torch.float16 - ): - model_dict[key] = model_dict[key].to(torch.float32) - if device == "shared": - model_dict[key] = model_dict[key].to("cpu").detach_() - if able_to_pin_layers: - try: - model_dict[key] = model_dict[key].pin_memory() - except: - able_to_pin_layers = False - elif device == "disk": - accelerate.utils.offload_weight( - model_dict[key], - get_original_key(key), - "accelerate-disk-cache", - index=utils.offload_index, - ) - model_dict[key] = model_dict[key].to("meta") - else: - model_dict[key] = model_dict[key].to(device) - # print("OK", flush=True) - current_offset += nbytes - utils.bar.update(1) - utils.koboldai_vars.loaded_layers += 1 - finally: - if ( - utils.num_shards is None - or utils.current_shard >= utils.num_shards - ): - if utils.offload_index: - for name, tensor in utils.named_buffers: - dtype = tensor.dtype - if ( - convert_to_float16 - and breakmodel.primary_device != "cpu" - and utils.koboldai_vars.hascuda - and ( - utils.koboldai_vars.breakmodel - or utils.koboldai_vars.usegpu - ) - ): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or ( - not utils.koboldai_vars.usegpu - and not utils.koboldai_vars.breakmodel - ): - dtype = torch.float32 - if ( - name in model_dict - and model_dict[name].dtype is not dtype - ): - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is 
not dtype: - tensor = tensor.to(dtype) - if name not in utils.offload_index: - accelerate.utils.offload_weight( - tensor, - name, - "accelerate-disk-cache", - index=utils.offload_index, - ) - accelerate.utils.save_offload_index( - utils.offload_index, "accelerate-disk-cache" - ) - utils.bar.close() - utils.bar = None - utils.koboldai_vars.status_message = "" - lazy_load_callback.nested = False - if isinstance(f, zipfile.ZipExtFile): - f.close() - - lazy_load_callback.nested = False - return lazy_load_callback - - @contextlib.contextmanager - def _maybe_use_float16(self, always_use: bool = False): - if always_use or ( - utils.koboldai_vars.hascuda - and self.low_mem - and (utils.koboldai_vars.usegpu or utils.koboldai_vars.breakmodel) - ): - original_dtype = torch.get_default_dtype() - torch.set_default_dtype(torch.float16) - yield True - torch.set_default_dtype(original_dtype) - else: - yield False - - def breakmodel_device_list(self, n_layers, primary=None, selected=None): - # TODO: Find a better place for this or rework this - - device_count = torch.cuda.device_count() - if device_count < 2: - primary = None - gpu_blocks = breakmodel.gpu_blocks + ( - device_count - len(breakmodel.gpu_blocks) - ) * [0] - print(f"{colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{colors.END}") - for i in range(device_count): - name = torch.cuda.get_device_name(i) - if len(name) > 47: - name = "..." + name[-44:] - row_color = colors.END - sep_color = colors.YELLOW - print( - f"{row_color}{colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{colors.END}" - ) - row_color = colors.END - sep_color = colors.YELLOW - print( - f"{row_color}{colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){colors.END}" - ) - print( - f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){colors.END}" - ) - - def breakmodel_device_config(self, config): - # TODO: Find a better place for this or rework this - - global breakmodel, generator - import breakmodel - - n_layers = utils.num_layers(config) - - if utils.args.cpu: - breakmodel.gpu_blocks = [0] * n_layers - return - - elif ( - utils.args.breakmodel_gpulayers is not None - or utils.args.breakmodel_disklayers is not None - ): - try: - if not utils.args.breakmodel_gpulayers: - breakmodel.gpu_blocks = [] - else: - breakmodel.gpu_blocks = list( - map(int, utils.args.breakmodel_gpulayers.split(",")) - ) - assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count() - s = n_layers - for i in range(len(breakmodel.gpu_blocks)): - if breakmodel.gpu_blocks[i] <= -1: - breakmodel.gpu_blocks[i] = s - break - else: - s -= breakmodel.gpu_blocks[i] - assert sum(breakmodel.gpu_blocks) <= n_layers - n_layers -= sum(breakmodel.gpu_blocks) - if utils.args.breakmodel_disklayers is not None: - assert utils.args.breakmodel_disklayers <= n_layers - breakmodel.disk_blocks = utils.args.breakmodel_disklayers - n_layers -= utils.args.breakmodel_disklayers - except: - logger.warning( - "--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0." 
- ) - breakmodel.gpu_blocks = [n_layers] - n_layers = 0 - elif utils.args.breakmodel_layers is not None: - breakmodel.gpu_blocks = [ - n_layers - max(0, min(n_layers, utils.args.breakmodel_layers)) - ] - n_layers -= sum(breakmodel.gpu_blocks) - elif utils.args.model is not None: - logger.info("Breakmodel not specified, assuming GPU 0") - breakmodel.gpu_blocks = [n_layers] - n_layers = 0 - else: - device_count = torch.cuda.device_count() - if device_count > 1: - print( - colors.CYAN - + "\nPlease select one of your GPUs to be your primary GPU." - ) - print( - "VRAM usage in your primary GPU will be higher than for your other ones." - ) - print("It is recommended you make your fastest GPU your primary GPU.") - self.breakmodel_device_list(n_layers) - while True: - primaryselect = input("device ID> ") - if ( - primaryselect.isnumeric() - and 0 <= int(primaryselect) < device_count - ): - breakmodel.primary_device = int(primaryselect) - break - else: - print( - f"{colors.RED}Please enter an integer between 0 and {device_count-1}.{colors.END}" - ) - else: - breakmodel.primary_device = 0 - - print( - colors.PURPLE - + "\nIf you don't have enough VRAM to run the model on a single GPU" - ) - print( - "you can split the model between your CPU and your GPU(s), or between" - ) - print("multiple GPUs if you have more than one.") - print("By putting more 'layers' on a GPU or CPU, more computations will be") - print( - "done on that device and more VRAM or RAM will be required on that device" - ) - print("(roughly proportional to number of layers).") - print( - "It should be noted that GPUs are orders of magnitude faster than the CPU." - ) - print( - f"This model has{colors.YELLOW} {n_layers} {colors.PURPLE}layers.{colors.END}\n" - ) - - for i in range(device_count): - self.breakmodel_device_list( - n_layers, primary=breakmodel.primary_device, selected=i - ) - print( - f"{colors.CYAN}\nHow many of the remaining{colors.YELLOW} {n_layers} {colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{colors.END}\n" - ) - while True: - layerselect = input("# of layers> ") - if ( - layerselect.isnumeric() or layerselect.strip() == "-1" - ) and -1 <= int(layerselect) <= n_layers: - layerselect = int(layerselect) - layerselect = n_layers if layerselect == -1 else layerselect - breakmodel.gpu_blocks.append(layerselect) - n_layers -= layerselect - break - else: - print( - f"{colors.RED}Please enter an integer between -1 and {n_layers}.{colors.END}" - ) - if n_layers == 0: - break - - if n_layers > 0: - self.breakmodel_device_list( - n_layers, primary=breakmodel.primary_device, selected=-1 - ) - print( - f"{colors.CYAN}\nHow many of the remaining{colors.YELLOW} {n_layers} {colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{colors.END}\n" - ) - while True: - layerselect = input("# of layers> ") - if ( - layerselect.isnumeric() or layerselect.strip() == "-1" - ) and -1 <= int(layerselect) <= n_layers: - layerselect = int(layerselect) - layerselect = n_layers if layerselect == -1 else layerselect - breakmodel.disk_blocks = layerselect - n_layers -= layerselect - break - else: - print( - f"{colors.RED}Please enter an integer between -1 and {n_layers}.{colors.END}" - ) - - logger.init_ok("Final device configuration:", status="Info") - self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device) - - # If all layers are on the same device, use the old GPU 
generation mode - while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0: - breakmodel.gpu_blocks.pop() - if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in ( - -1, - utils.num_layers(config), - ): - utils.koboldai_vars.breakmodel = False - utils.koboldai_vars.usegpu = True - utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks) - 1 - return - - if not breakmodel.gpu_blocks: - logger.warning("Nothing assigned to a GPU, reverting to CPU only mode") - import breakmodel - - breakmodel.primary_device = "cpu" - utils.koboldai_vars.breakmodel = False - utils.koboldai_vars.usegpu = False - return - - -class GenericHFTorchInferenceModel(HFTorchInferenceModel): - def _load(self, save_model: bool, initial_load: bool) -> None: - utils.koboldai_vars.allowsp = True - - # Make model path the same as the model name to make this consistent - # with the other loading method if it isn't a known model type. This - # code is not just a workaround for below, it is also used to make the - # behavior consistent with other loading methods - Henk717 - # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]: - # utils.koboldai_vars.custmodpth = utils.koboldai_vars.model - - if utils.koboldai_vars.model == "NeoCustom": - utils.koboldai_vars.model = os.path.basename( - os.path.normpath(utils.koboldai_vars.custmodpth) - ) - - # If we specify a model and it's in the root directory, we need to move - # it to the models directory (legacy folder structure to new) - if self.get_local_model_path(legacy=True): - shutil.move( - self.get_local_model_path(legacy=True, ignore_existance=True), - self.get_local_model_path(ignore_existance=True), - ) - - self.init_model_config() - - tf_kwargs = { - "low_cpu_mem_usage": True, - } - - if utils.koboldai_vars.model_type == "gpt2": - # We must disable low_cpu_mem_usage and if using a GPT-2 model - # because GPT-2 is not compatible with this feature yet. 
- tf_kwargs.pop("low_cpu_mem_usage", None) - - # Also, lazy loader doesn't support GPT-2 models - utils.koboldai_vars.lazy_load = False - - # If we're using torch_lazy_loader, we need to get breakmodel config - # early so that it knows where to load the individual model tensors - if ( - utils.koboldai_vars.lazy_load - and utils.koboldai_vars.hascuda - and utils.koboldai_vars.breakmodel - and not utils.koboldai_vars.nobreakmodel - ): - self.breakmodel_device_config(self.model_config) - - if utils.koboldai_vars.lazy_load: - # If we're using lazy loader, we need to figure out what the model's hidden layers are called - with torch_lazy_loader.use_lazy_torch_load( - dematerialized_modules=True, use_accelerate_init_empty_weights=True - ): - try: - metamodel = AutoModelForCausalLM.from_config(self.model_config) - except Exception as e: - metamodel = GPTNeoForCausalLM.from_config(self.model_config) - utils.layers_module_names = utils.get_layers_module_names(metamodel) - utils.module_names = list(metamodel.state_dict().keys()) - utils.named_buffers = list(metamodel.named_buffers(recurse=True)) - - # Download model from Huggingface if it does not exist, otherwise load locally - with self._maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load( - enable=utils.koboldai_vars.lazy_load, - callback=self._get_lazy_load_callback(utils.num_layers(self.model_config)) - if utils.koboldai_vars.lazy_load - else None, - dematerialized_modules=True, - ): - if utils.koboldai_vars.lazy_load: - # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - tf_kwargs.pop("low_cpu_mem_usage", None) - - if self.get_local_model_path(): - # Model is stored locally, load it. - self.model = self._get_model(self.get_local_model_path(), tf_kwargs) - self.tokenizer = self._get_tokenizer(self.get_local_model_path()) - else: - # Model not stored locally, we need to download it. 
- - # _rebuild_tensor patch for casting dtype and supporting LazyTensors - old_rebuild_tensor = torch._utils._rebuild_tensor - - def new_rebuild_tensor( - storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], - storage_offset, - shape, - stride, - ): - if not isinstance(storage, torch_lazy_loader.LazyTensor): - dtype = storage.dtype - else: - dtype = storage.storage_type.dtype - if not isinstance(dtype, torch.dtype): - dtype = storage.storage_type(0).dtype - if dtype is torch.float32 and len(shape) >= 2: - utils.koboldai_vars.fp32_model = True - return old_rebuild_tensor(storage, storage_offset, shape, stride) - - torch._utils._rebuild_tensor = new_rebuild_tensor - self.model = self._get_model(utils.koboldai_vars.model, tf_kwargs) - self.tokenizer = self._get_tokenizer(utils.koboldai_vars.model) - torch._utils._rebuild_tensor = old_rebuild_tensor - - if save_model: - self.tokenizer.save_pretrained( - self.get_local_model_path(ignore_existance=True) - ) - - if utils.koboldai_vars.fp32_model and not breakmodel.disk_blocks: - # Use save_pretrained to convert fp32 models to fp16, - # unless we are using disk cache because save_pretrained - # is not supported in that case - model = model.half() - model.save_pretrained( - self.get_local_model_path(ignore_existance=True), - max_shard_size="500MiB", - ) - - else: - # For fp16 models, we can just copy the model files directly - import transformers.configuration_utils - import transformers.modeling_utils - import transformers.file_utils - import huggingface_hub - - legacy = packaging.version.parse( - transformers_version - ) < packaging.version.parse("4.22.0.dev0") - # Save the config.json - shutil.move( - os.path.realpath( - huggingface_hub.hf_hub_download( - utils.koboldai_vars.model, - transformers.configuration_utils.CONFIG_NAME, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=legacy, - ) - ), - os.path.join( - self.get_local_model_path(ignore_existance=True), - transformers.configuration_utils.CONFIG_NAME, - ), - ) - - if utils.num_shards is None: - # Save the pytorch_model.bin or model.safetensors of an unsharded model - for possible_weight_name in [ - transformers.modeling_utils.WEIGHTS_NAME, - "model.safetensors", - ]: - try: - shutil.move( - os.path.realpath( - huggingface_hub.hf_hub_download( - utils.koboldai_vars.model, - possible_weight_name, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=legacy, - ) - ), - os.path.join( - self.get_local_model_path( - ignore_existance=True - ), - possible_weight_name, - ), - ) - except Exception as e: - if possible_weight_name == "model.safetensors": - raise e - else: - # Handle saving sharded models - - with open(utils.from_pretrained_index_filename) as f: - map_data = json.load(f) - filenames = set(map_data["weight_map"].values()) - # Save the pytorch_model.bin.index.json of a sharded model - shutil.move( - os.path.realpath(utils.from_pretrained_index_filename), - os.path.join( - self.get_local_model_path(ignore_existance=True), - transformers.modeling_utils.WEIGHTS_INDEX_NAME, - ), - ) - # Then save the pytorch_model-#####-of-#####.bin files - for filename in filenames: - shutil.move( - os.path.realpath( - huggingface_hub.hf_hub_download( - utils.koboldai_vars.model, - filename, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=legacy, - ) - ), - os.path.join( - self.get_local_model_path( - ignore_existance=True - 
), - filename, - ), - ) - shutil.rmtree("cache/") - - if ( - utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default - and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj") - ): - utils.koboldai_vars.badwordsids = [ - [v] - for k, v in self.tokenizer.get_vocab().items() - if any(c in str(k) for c in "<>[]") - if utils.koboldai_vars.newlinemode != "s" or str(k) != "" - ] - - self.patch_embedding() - - if utils.koboldai_vars.hascuda: - if utils.koboldai_vars.usegpu: - # Use just VRAM - self.model = self.model.half().to(utils.koboldai_vars.gpu_device) - elif utils.koboldai_vars.breakmodel: - # Use both RAM and VRAM (breakmodel) - if not utils.koboldai_vars.lazy_load: - self.breakmodel_device_config(model.config) - self._move_to_devices() - elif breakmodel.disk_blocks > 0: - # Use disk - self._move_to_devices() - elif breakmodel.disk_blocks > 0: - self._move_to_devices() - else: - # Use CPU - self.model = self.model.to("cpu").float() - elif breakmodel.disk_blocks > 0: - self._move_to_devices() - else: - self.model = self.model.to("cpu").float() - self.model.kai_model = self - utils.koboldai_vars.modeldim = self.get_hidden_size() - - -class CustomGPT2HFTorchInferenceModel(HFTorchInferenceModel): - def _load(self, save_model: bool, initial_load: bool) -> None: - utils.koboldai_vars.lazy_load = False - - model_path = None - - for possible_config_path in [ - utils.koboldai_vars.custmodpth, - os.path.join("models", utils.koboldai_vars.custmodpth), - ]: - try: - with open( - os.path.join(possible_config_path, "config.json"), "r" - ) as file: - self.model_config = json.load(file) - model_path = possible_config_path - break - except FileNotFoundError: - pass - - if not model_path: - raise RuntimeError("Empty model_path!") - - with self._maybe_use_float16(): - try: - self.model = GPT2LMHeadModel.from_pretrained( - utils.koboldai_vars.custmodpth, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - ) - self.tokenizer = GPT2Tokenizer.from_pretrained( - utils.koboldai_vars.custmodpth, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - ) - except Exception as e: - if "out of memory" in traceback.format_exc().lower(): - raise RuntimeError( - "One of your GPUs ran out of memory when KoboldAI tried to load your model." - ) - raise e - - if save_model: - self.model.save_pretrained( - self.get_local_model_path(ignore_existance=True), - max_shard_size="500MiB", - ) - self.tokenizer.save_pretrained( - self.get_local_model_path(ignore_existance=True) - ) - - utils.koboldai_vars.modeldim = self.get_hidden_size() - - # Is CUDA available? If so, use GPU, otherwise fall back to CPU - if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu: - self.model = self.model.half().to(utils.koboldai_vars.gpu_device) - else: - self.model = self.model.to("cpu").float() - - self.patch_causal_lm() - - -class OpenAIAPIInferenceModel(InferenceModel): - def _load(self, save_model: bool, initial_load: bool) -> None: - self.tokenizer = self._get_tokenizer("gpt2") - - def _raw_generate( - self, - prompt_tokens: Union[List[int], torch.Tensor], - max_new: int, - gen_settings: GenerationSettings, - single_line: bool = False, - batch_count: int = 1, - ) -> GenerationResult: - # Taken mainly from oairequest() - - decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens)) - - # Store context in memory to use it for comparison with generated content - utils.koboldai_vars.lastctx = decoded_prompt - - # Build request JSON data - # GooseAI is a subntype of OAI. 
So to check if it's this type, we check the configname as a workaround - # as the koboldai_vars.model will always be OAI - if "GooseAI" in utils.koboldai_vars.configname: - reqdata = { - "prompt": decoded_prompt, - "max_tokens": max_new, - "temperature": gen_settings.temp, - "top_a": gen_settings.top_a, - "top_p": gen_settings.top_p, - "top_k": gen_settings.top_k, - "tfs": gen_settings.tfs, - "typical_p": gen_settings.typical, - "repetition_penalty": gen_settings.rep_pen, - "repetition_penalty_slope": gen_settings.rep_pen_slope, - "repetition_penalty_range": gen_settings.rep_pen_range, - "n": batch_count, - # TODO: Implement streaming - "stream": False, - } - else: - reqdata = { - "prompt": decoded_prompt, - "max_tokens": max_new, - "temperature": gen_settings.temp, - "top_p": gen_settings.top_p, - "frequency_penalty": gen_settings.rep_pen, - "n": batch_count, - "stream": False, - } - - req = requests.post( - utils.koboldai_vars.oaiurl, - json=reqdata, - headers={ - "Authorization": "Bearer " + utils.koboldai_vars.oaiapikey, - "Content-Type": "application/json", - }, - ) - - j = req.json() - - if not req.ok: - # Send error message to web client - if "error" in j: - error_type = j["error"]["type"] - error_message = j["error"]["message"] - else: - error_type = "Unknown" - error_message = "Unknown" - raise OpenAIAPIError(error_type, error_message) - - outputs = [out["text"] for out in j["choices"]] - return GenerationResult( - model=self, - out_batches=np.array([self.tokenizer.encode(x) for x in outputs]), - prompt=prompt_tokens, - is_whole_generation=True, - single_line=single_line, - ) - - -class HordeInferenceModel(InferenceModel): - def _load(self, save_model: bool, initial_load: bool) -> None: - self.tokenizer = self._get_tokenizer( - utils.koboldai_vars.cluster_requested_models[0] - if len(utils.koboldai_vars.cluster_requested_models) > 0 - else "gpt2", - ) - - def _raw_generate( - self, - prompt_tokens: Union[List[int], torch.Tensor], - max_new: int, - gen_settings: GenerationSettings, - single_line: bool = False, - batch_count: int = 1, - ) -> GenerationResult: - decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens)) - - # Store context in memory to use it for comparison with generated content - utils.koboldai_vars.lastctx = decoded_prompt - - # Build request JSON data - reqdata = { - "max_length": max_new, - "max_context_length": utils.koboldai_vars.max_length, - "rep_pen": gen_settings.rep_pen, - "rep_pen_slope": gen_settings.rep_pen_slope, - "rep_pen_range": gen_settings.rep_pen_range, - "temperature": gen_settings.temp, - "top_p": gen_settings.top_p, - "top_k": int(gen_settings.top_k), - "top_a": gen_settings.top_a, - "tfs": gen_settings.tfs, - "typical": gen_settings.typical, - "n": batch_count, - } - - cluster_metadata = { - "prompt": decoded_prompt, - "params": reqdata, - "models": [x for x in utils.koboldai_vars.cluster_requested_models if x], - "trusted_workers": False, - } - - client_agent = "KoboldAI:2.0.0:koboldai.org" - cluster_headers = { - "apikey": utils.koboldai_vars.horde_api_key, - "Client-Agent": client_agent, - } - - try: - # Create request - req = requests.post( - utils.koboldai_vars.colaburl[:-8] + "/api/v2/generate/text/async", - json=cluster_metadata, - headers=cluster_headers, - ) - except requests.exceptions.ConnectionError: - errmsg = f"Horde unavailable. 
Please try again later" - logger.error(errmsg) - raise HordeException(errmsg) - - if req.status_code == 503: - errmsg = f"KoboldAI API Error: No available KoboldAI servers found in Horde to fulfil this request using the selected models or other properties." - logger.error(errmsg) - raise HordeException(errmsg) - elif not req.ok: - errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console." - logger.error(errmsg) - logger.error(f"HTTP {req.status_code}!!!") - logger.error(req.text) - raise HordeException(errmsg) - - try: - req_status = req.json() - except requests.exceptions.JSONDecodeError: - errmsg = f"Unexpected message received from the Horde: '{req.text}'" - logger.error(errmsg) - raise HordeException(errmsg) - - request_id = req_status["id"] - logger.debug("Horde Request ID: {}".format(request_id)) - - # We've sent the request and got the ID back, now we need to watch it to see when it finishes - finished = False - - cluster_agent_headers = {"Client-Agent": client_agent} - - while not finished: - try: - req = requests.get( - f"{utils.koboldai_vars.colaburl[:-8]}/api/v2/generate/text/status/{request_id}", - headers=cluster_agent_headers, - ) - except requests.exceptions.ConnectionError: - errmsg = f"Horde unavailable. Please try again later" - logger.error(errmsg) - raise HordeException(errmsg) - - if not req.ok: - errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console." - logger.error(req.text) - raise HordeException(errmsg) - - try: - req_status = req.json() - except requests.exceptions.JSONDecodeError: - errmsg = ( - f"Unexpected message received from the KoboldAI Horde: '{req.text}'" - ) - logger.error(errmsg) - raise HordeException(errmsg) - - if "done" not in req_status: - errmsg = f"Unexpected response received from the KoboldAI Horde: '{req_status}'" - logger.error(errmsg) - raise HordeException(errmsg) - - finished = req_status["done"] - utils.koboldai_vars.horde_wait_time = req_status["wait_time"] - utils.koboldai_vars.horde_queue_position = req_status["queue_position"] - utils.koboldai_vars.horde_queue_size = req_status["waiting"] - - if not finished: - logger.debug(req_status) - time.sleep(1) - - logger.debug("Last Horde Status Message: {}".format(req_status)) - - if req_status["faulted"]: - raise HordeException("Horde Text generation faulted! 
Please try again.") - - generations = req_status["generations"] - gen_servers = [(cgen["worker_name"], cgen["worker_id"]) for cgen in generations] - logger.info(f"Generations by: {gen_servers}") - - return GenerationResult( - model=self, - out_batches=np.array( - [self.tokenizer.encode(cgen["text"]) for cgen in generations] - ), - prompt=prompt_tokens, - is_whole_generation=True, - single_line=single_line, - ) - - -class ColabInferenceModel(InferenceModel): - def _load(self, save_model: bool, initial_load: bool) -> None: - self.tokenizer = self._get_tokenizer("EleutherAI/gpt-neo-2.7B") - - def _raw_generate( - self, - prompt_tokens: Union[List[int], torch.Tensor], - max_new: int, - gen_settings: GenerationSettings, - single_line: bool = False, - batch_count: int = 1, - ): - decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens)) - - # Store context in memory to use it for comparison with generated content - utils.koboldai_vars.lastctx = decoded_prompt - - # Build request JSON data - reqdata = { - "text": decoded_prompt, - "min": 0, - "max": max_new, - "rep_pen": gen_settings.rep_pen, - "rep_pen_slope": gen_settings.rep_pen_slope, - "rep_pen_range": gen_settings.rep_pen_range, - "temperature": gen_settings.temp, - "top_p": gen_settings.top_p, - "top_k": gen_settings.top_k, - "tfs": gen_settings.tfs, - "typical": gen_settings.typical, - "topa": gen_settings.top_a, - "numseqs": batch_count, - "retfultxt": False, - } - - # Create request - req = requests.post(utils.koboldai_vars.colaburl, json=reqdata) - - if req.status_code != 200: - raise ColabException(f"Bad status code {req.status_code}") - - # Deal with the response - js = req.json()["data"] - - # Try to be backwards compatible with outdated colab - if "text" in js: - genout = [utils.getnewcontent(js["text"], self.tokenizer)] - else: - genout = js["seqs"] - - return GenerationResult( - model=self, - out_batches=np.array([self.tokenizer.encode(x) for x in genout]), - prompt=prompt_tokens, - is_whole_generation=True, - single_line=single_line, - ) - - -class APIInferenceModel(InferenceModel): - def _load(self, save_model: bool, initial_load: bool) -> None: - tokenizer_id = requests.get( - utils.koboldai_vars.colaburl[:-8] + "/api/v1/model", - ).json()["result"] - self.tokenizer = self._get_tokenizer(tokenizer_id) - - def _raw_generate( - self, - prompt_tokens: Union[List[int], torch.Tensor], - max_new: int, - gen_settings: GenerationSettings, - single_line: bool = False, - batch_count: int = 1, - ): - decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens)) - - # Store context in memory to use it for comparison with generated content - utils.koboldai_vars.lastctx = decoded_prompt - - # Build request JSON data - reqdata = { - "prompt": decoded_prompt, - "max_length": max_new, - "max_context_length": utils.koboldai_vars.max_length, - "rep_pen": gen_settings.rep_pen, - "rep_pen_slope": gen_settings.rep_pen_slope, - "rep_pen_range": gen_settings.rep_pen_range, - "temperature": gen_settings.temp, - "top_p": gen_settings.top_p, - "top_k": gen_settings.top_k, - "top_a": gen_settings.top_a, - "tfs": gen_settings.tfs, - "typical": gen_settings.typical, - "n": batch_count, - } - - # Create request - while True: - req = requests.post( - utils.koboldai_vars.colaburl[:-8] + "/api/v1/generate", - json=reqdata, - ) - if ( - req.status_code == 503 - ): # Server is currently generating something else so poll until it's our turn - time.sleep(1) - continue - - js = req.json() - if req.status_code != 200: - 
logger.error(json.dumps(js, indent=4)) - raise APIException(f"Bad API status code {req.status_code}") - - genout = [obj["text"] for obj in js["results"]] - return GenerationResult( - model=self, - out_batches=np.array([self.tokenizer.encode(x) for x in genout]), - prompt=prompt_tokens, - is_whole_generation=True, - single_line=single_line, - ) diff --git a/modeling/inference_model.py b/modeling/inference_model.py new file mode 100644 index 00000000..8b7b9114 --- /dev/null +++ b/modeling/inference_model.py @@ -0,0 +1,591 @@ +from __future__ import annotations + +from dataclasses import dataclass +import time +from typing import List, Optional, Union +from logger import logger + +import torch +import numpy as np +import transformers +from transformers import ( + GPT2Tokenizer, + AutoTokenizer, +) + +import utils + +try: + import tpu_mtj_backend +except ModuleNotFoundError as e: + # Not on TPU... hopefully + if utils.koboldai_vars.use_colab_tpu: + raise e + +# I don't really like this way of pointing to the current model but I can't +# find a way around it in some areas. +current_model = None + +# We only want to use logit manipulations and such on our core text model +class use_core_manipulations: + """Use in a `with` block to patch functions for core story model sampling.""" + + # These must be set by wherever they get setup + get_logits_processor: callable = None + sample: callable = None + get_stopping_criteria: callable = None + + # We set these automatically + old_get_logits_processor: callable = None + old_sample: callable = None + old_get_stopping_criteria: callable = None + + def __enter__(self): + if use_core_manipulations.get_logits_processor: + use_core_manipulations.old_get_logits_processor = ( + transformers.GenerationMixin._get_logits_processor + ) + transformers.GenerationMixin._get_logits_processor = ( + use_core_manipulations.get_logits_processor + ) + + if use_core_manipulations.sample: + use_core_manipulations.old_sample = transformers.GenerationMixin.sample + transformers.GenerationMixin.sample = use_core_manipulations.sample + + if use_core_manipulations.get_stopping_criteria: + use_core_manipulations.old_get_stopping_criteria = ( + transformers.GenerationMixin._get_stopping_criteria + ) + transformers.GenerationMixin._get_stopping_criteria = ( + use_core_manipulations.get_stopping_criteria + ) + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + if use_core_manipulations.old_get_logits_processor: + transformers.GenerationMixin._get_logits_processor = ( + use_core_manipulations.old_get_logits_processor + ) + else: + assert ( + not use_core_manipulations.get_logits_processor + ), "Patch leak: THE MONKEYS HAVE ESCAPED" + + if use_core_manipulations.old_sample: + transformers.GenerationMixin.sample = use_core_manipulations.old_sample + else: + assert ( + not use_core_manipulations.sample + ), "Patch leak: THE MONKEYS HAVE ESCAPED" + + if use_core_manipulations.old_get_stopping_criteria: + transformers.GenerationMixin._get_stopping_criteria = ( + use_core_manipulations.old_get_stopping_criteria + ) + else: + assert ( + not use_core_manipulations.get_stopping_criteria + ), "Patch leak: THE MONKEYS HAVE ESCAPED" + + +class GenerationResult: + """A container for easily accessing different forms of model outputs. Returned by most generate functions.""" + + def __init__( + self, + model: InferenceModel, + out_batches: list, + prompt: list, + # Controls if generate() does it's looping thing. 
This should only be + # done for HF models that use that StoppingCondition + is_whole_generation: bool, + # Controls if we should trim output by prompt length + output_includes_prompt: bool = False, + # Lazy filter to cut off extra lines where we can't manipulate + # probabilities + single_line: bool = False, + ): + # Shave prompt off of encoded response when needed (HF). Decoded does + # not return prompt. + if output_includes_prompt: + self.encoded = out_batches[:, len(prompt) :] + else: + self.encoded = out_batches + + self.prompt = prompt + self.is_whole_generation = is_whole_generation + + self.decoded = [ + utils.decodenewlines(model.tokenizer.decode(enc)) for enc in self.encoded + ] + + if single_line: + self.decoded = [x.split("\n", 1)[0] for x in self.decoded] + self.encoded = np.array(model.tokenizer(self.decoded).input_ids) + + +class GenerationSettings: + """Structure for holding temporarily overwritten settings.""" + + def __init__(self, **overrides) -> None: + for setting in [ + "temp", + "top_p", + "top_k", + "tfs", + "typical", + "top_a", + "rep_pen", + "rep_pen_slope", + "rep_pen_range", + "sampler_order", + ]: + setattr( + self, + setting, + overrides.get(setting, getattr(utils.koboldai_vars, setting)), + ) + + +@dataclass +class ModelCapabilities: + embedding_manipulation: bool = False + post_token_hooks: bool = False + stopper_hooks: bool = False + # TODO: Support non-live probabilities from APIs + post_token_probs: bool = False + + +class InferenceModel: + """Root class for all models.""" + + def __init__(self) -> None: + self.gen_state = {} + self.post_token_hooks = [] + self.stopper_hooks = [] + self.tokenizer = None + self.capabilties = ModelCapabilities() + + def load(self, save_model: bool = False, initial_load: bool = False) -> None: + """User-facing load function. Do not override this; try `_load()` instead.""" + + self._load(save_model=save_model, initial_load=initial_load) + self._post_load() + + global current_model + current_model = self + + print(self.raw_generate("Hi guys,", 20).__dict__) + + def _post_load(self) -> None: + """Post load hook. Called after `_load()`.""" + + def _load(self, save_model: bool, initial_load: bool) -> None: + """Main load method. All logic related to loading the model onto the + selected device(s) and preparing it for inference should be implemented here.""" + raise NotImplementedError + + def _get_tokenizer(self, location: str) -> AutoTokenizer: + """Returns the appropiate tokenizer for the location. Should be ran once and result stored in `tokenizer`. + + Args: + location (str): Either a local model directory path or a HuggingFace model ID. + + Returns: + AutoTokenizer: Tokenizer deemed fit for the location string. May be a fallback tokenizer. + """ + if utils.koboldai_vars.model_type == "xglm": + # Default to newline mode if using XGLM + utils.koboldai_vars.newlinemode = "s" + elif utils.koboldai_vars.model_type in ["opt", "bloom"]: + # Handle but don't convert newlines if using Fairseq models that have newlines trained in them + utils.koboldai_vars.newlinemode = "ns" + + std_kwargs = {"revision": utils.koboldai_vars.revision, "cache_dir": "cache"} + + suppliers = [ + # Fast tokenizer disabled by default as per HF docs: + # > Note: Make sure to pass use_fast=False when loading + # OPT’s tokenizer with AutoTokenizer to get the correct + # tokenizer. 
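+            # (Added note) The suppliers below are tried in order and the
+            # first one that does not raise wins; the plain "gpt2" fallback
+            # only fires when the model ships no usable tokenizer files at all.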
+ lambda: AutoTokenizer.from_pretrained( + location, use_fast=False, **std_kwargs + ), + lambda: AutoTokenizer.from_pretrained(location, **std_kwargs), + # Fallback to GPT2Tokenizer + lambda: GPT2Tokenizer.from_pretrained(location, **std_kwargs), + lambda: GPT2Tokenizer.from_pretrained("gpt2", **std_kwargs), + ] + + for i, try_get_tokenizer in enumerate(suppliers): + try: + return try_get_tokenizer() + except: + # If we error on each attempt, raise the last one + if i == len(suppliers) - 1: + raise + + def core_generate( + self, + text: list, + found_entries: set, + ): + """Generate story text. Heavily tied to story-specific parameters; if + you are making a new generation-based feature, consider `generate_raw()`. + + Args: + text (list): Encoded input tokens + found_entries (set): Entries found for Dynamic WI + + Raises: + RuntimeError: if inconsistancies are detected with the internal state and Lua state -- sanity check + RuntimeError: if inconsistancies are detected with the internal state and core stopper -- sanity check + """ + + start_time = time.time() + gen_in = torch.tensor(text, dtype=torch.long)[None] + logger.debug( + "core_generate: torch.tensor time {}s".format(time.time() - start_time) + ) + + start_time = time.time() + if utils.koboldai_vars.is_model_torch(): + # Torch stuff + if utils.koboldai_vars.full_determinism: + torch.manual_seed(utils.koboldai_vars.seed) + + if utils.koboldai_vars.sp is not None: + assert self.capabilties.embedding_manipulation + soft_tokens = torch.arange( + self.model.config.vocab_size, + self.model.config.vocab_size + utils.koboldai_vars.sp.shape[0], + ) + gen_in = torch.cat((soft_tokens[None], gen_in), dim=-1) + elif utils.koboldai_vars.use_colab_tpu: + if utils.koboldai_vars.full_determinism: + tpu_mtj_backend.set_rng_seed(utils.koboldai_vars.seed) + + logger.debug( + "core_generate: Model Setup (SP, etc) time {}s".format( + time.time() - start_time + ) + ) + + if ( + gen_in.shape[-1] + utils.koboldai_vars.genamt + > utils.koboldai_vars.max_length + ): + logger.error("gen_in.shape[-1]: {}".format(gen_in.shape[-1])) + logger.error( + "utils.koboldai_vars.genamt: {}".format(utils.koboldai_vars.genamt) + ) + logger.error( + "utils.koboldai_vars.max_length: {}".format( + utils.koboldai_vars.max_length + ) + ) + assert ( + gen_in.shape[-1] + utils.koboldai_vars.genamt + <= utils.koboldai_vars.max_length + ) + + start_time = time.time() + gen_in = gen_in.to(utils.get_auxilary_device()) + + logger.debug( + "core_generate: gen_in to device time {}s".format(time.time() - start_time) + ) + start_time = time.time() + + found_entries = found_entries or set() + + self.gen_state["wi_scanner_excluded_keys"] = found_entries + + utils.koboldai_vars._prompt = utils.koboldai_vars.prompt + + with torch.no_grad(): + already_generated = 0 + numseqs = utils.koboldai_vars.numseqs + total_gens = None + + for i in range( + utils.koboldai_vars.numseqs if utils.koboldai_vars.alt_multi_gen else 1 + ): + while True: + # The reason this is a loop is due to how Dynamic WI works. We + # cannot simply add the WI to the context mid-generation, so we + # stop early, and then insert WI, then continue generating. That + # stopping and continuing is this loop. 
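+                    # (Added note) Concretely: each raw_generate() pass below runs
+                    # until either genamt is reached or the WI scanner stops it
+                    # early; in the second case the found entries are folded into
+                    # the context further down and the loop starts another pass.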
+ + start_time = time.time() + result = self.raw_generate( + gen_in[0], + max_new=utils.koboldai_vars.genamt, + do_streaming=utils.koboldai_vars.output_streaming, + do_dynamic_wi=utils.koboldai_vars.dynamicscan, + batch_count=numseqs + if not utils.koboldai_vars.alt_multi_gen + else 1, + # Real max length is handled by CoreStopper. + bypass_hf_maxlength=utils.koboldai_vars.dynamicscan, + is_core=True, + ) + logger.debug( + "core_generate: run raw_generate pass {} {}s".format( + already_generated, time.time() - start_time + ) + ) + + genout = result.encoded + + already_generated += len(genout[0]) + + try: + assert ( + already_generated + <= utils.koboldai_vars.genamt * utils.koboldai_vars.numseqs + if utils.koboldai_vars.alt_multi_gen + else 1 + ) + except AssertionError: + print("AlreadyGenerated", already_generated) + print("genamt", utils.koboldai_vars.genamt) + raise + + if result.is_whole_generation: + break + + # Generation stopped; why? + # If we have been told to halt, we have reached our target token + # amount (controlled by halt), or Dynamic WI has not told us to + # stop temporarily to insert WI, we can assume that we are done + # generating. We shall break. + if ( + self.gen_state["halt"] + or not self.gen_state["regeneration_required"] + ): + break + + # Now we are doing stuff for Dynamic WI. + assert genout.ndim >= 2 + assert genout.shape[0] == utils.koboldai_vars.numseqs + + if ( + utils.koboldai_vars.lua_koboldbridge.generated_cols + and utils.koboldai_vars.generated_tkns + != utils.koboldai_vars.lua_koboldbridge.generated_cols + ): + raise RuntimeError( + f"Inconsistency detected between KoboldAI Python and Lua backends ({utils.koboldai_vars.generated_tkns} != {utils.koboldai_vars.lua_koboldbridge.generated_cols})" + ) + + if already_generated != utils.koboldai_vars.generated_tkns: + print("already_generated: {}".format(already_generated)) + print( + "generated_tkns: {}".format( + utils.koboldai_vars.generated_tkns + ) + ) + raise RuntimeError("WI scanning error") + + for r in range(utils.koboldai_vars.numseqs): + for c in range(already_generated): + assert ( + utils.koboldai_vars.lua_koboldbridge.generated[r + 1][ + c + 1 + ] + is not None + ) + genout[r][ + genout.shape[-1] - already_generated + c + ] = utils.koboldai_vars.lua_koboldbridge.generated[r + 1][ + c + 1 + ] + + encoded = [] + + for i in range(utils.koboldai_vars.numseqs): + txt = utils.decodenewlines( + self.tokenizer.decode(genout[i, -already_generated:]) + ) + # winfo, mem, anotetxt, _found_entries = calcsubmitbudgetheader(txt, force_use_txt=True, actions=utils.koboldai_vars.actions) + # txt, _, _ = calcsubmitbudget(len(utils.koboldai_vars.actions), winfo, mem, anotetxt, utils.koboldai_vars.actions, submission=txt) + txt, _, _, _found_entries = utils.koboldai_vars.calc_ai_text( + submitted_text=txt, send_context=False + ) + found_entries[i].update(_found_entries) + encoded.append( + torch.tensor(txt, dtype=torch.long, device=genout.device) + ) + + max_length = len(max(encoded, key=len)) + encoded = torch.stack( + tuple( + torch.nn.functional.pad( + e, + (max_length - len(e), 0), + value=self.model.config.pad_token_id + or self.model.config.eos_token_id, + ) + for e in encoded + ) + ) + genout = torch.cat( + ( + encoded, + genout[..., -already_generated:], + ), + dim=-1, + ) + + if utils.koboldai_vars.sp is not None: + soft_tokens = torch.arange( + self.model.config.vocab_size, + self.model.config.vocab_size + + utils.koboldai_vars.sp.shape[0], + device=genout.device, + ) + genout = torch.cat( + 
(soft_tokens.tile(utils.koboldai_vars.numseqs, 1), genout), + dim=-1, + ) + + assert ( + genout.shape[-1] + + utils.koboldai_vars.genamt + - already_generated + <= utils.koboldai_vars.max_length + ) + gen_in = genout + numseqs = 1 + if total_gens is None: + total_gens = genout + else: + total_gens = torch.cat((total_gens, genout)) + + return total_gens, already_generated + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + ) -> GenerationResult: + """Lowest level model-agnostic generation function. To be overridden by model implementation. + + Args: + prompt_tokens (Union[List[int], torch.Tensor]): Prompt as encoded token IDs + max_new (int): Maximum amount of new tokens to generate + gen_settings (GenerationSettings): State to pass in single-generation setting overrides + single_line (bool, optional): Generate one line only. Defaults to False. + batch_count (int, optional): How big of a batch to generate. Defaults to 1. + + Returns: + GenerationResult: The model's output + """ + raise NotImplementedError + + def raw_generate( + self, + # prompt is either a string (text) or a list (token ids) + prompt: Union[str, list, np.ndarray], + max_new: int, + do_streaming: bool = False, + do_dynamic_wi: bool = False, + batch_count: int = 1, + bypass_hf_maxlength: bool = False, + generation_settings: Optional[dict] = None, + is_core: bool = False, + single_line: bool = False, + found_entries: set = (), + ) -> GenerationResult: + """A wrapper around `_raw_generate()` that handles gen_state and other stuff. Use this to generate text outside of the story. + + Args: + prompt (Union[str, list, np.ndarray]): The prompt as a string or encoded token IDs + max_new (int): Maximum amount of new tokens to generate + do_streaming (bool, optional): Whether to stream tokens to the user or not. Defaults to False. + do_dynamic_wi (bool, optional): Whether to use Dynamic WI context injections. Defaults to False. + batch_count (int, optional): How big of a batch to generate. Defaults to 1. + bypass_hf_maxlength (bool, optional): Whether to ignore model-provided max length limits. Defaults to False. + generation_settings (GenerationSettings): State to pass in single-generation setting overrides. Defaults to None + is_core (bool, optional): Whether this generation is a core story generation. Defaults to False. + single_line (bool, optional): Generate one line only.. Defaults to False. + found_entries (set, optional): Entries found for Dynamic WI. Defaults to (). + + Raises: + ValueError: If prompt type is weird + NotImplementedError: If model is ReadOnly + + Returns: + GenerationResult: The model's output + """ + # TODO: Support singleline outside of torch + + self.gen_state["do_streaming"] = do_streaming + self.gen_state["do_dynamic_wi"] = do_dynamic_wi + + # Dynamic WI depends on this!!! This is a main gen call. 
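+        # (Added note) stop_at_genamt ties the stopping criteria to genamt: the
+        # pass below is cut off once genamt tokens exist, so Dynamic WI can
+        # inject entries between passes (see the loop in core_generate()).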
+ self.gen_state["stop_at_genamt"] = do_dynamic_wi + + # Makes stopping criteria hook happy + self.gen_state["wi_scanner_excluded_keys"] = self.gen_state.get( + "wi_scanner_excluded_keys", set() + ) + + utils.koboldai_vars.inference_config.do_core = is_core + gen_settings = GenerationSettings(*(generation_settings or {})) + + if isinstance(prompt, torch.Tensor): + prompt_tokens = prompt.cpu().numpy() + elif isinstance(prompt, list): + prompt_tokens = np.array(prompt) + elif isinstance(prompt, str): + prompt_tokens = np.array(self.tokenizer.encode(prompt)) + else: + raise ValueError(f"Prompt is {type(prompt)}. Not a fan!") + + assert isinstance(prompt_tokens, np.ndarray) + assert len(prompt_tokens.shape) == 1 + + if utils.koboldai_vars.model == "ReadOnly": + raise NotImplementedError("No loaded model") + + time_start = time.time() + + with use_core_manipulations(): + result = self._raw_generate( + prompt_tokens=prompt_tokens, + max_new=max_new, + batch_count=batch_count, + gen_settings=gen_settings, + single_line=single_line, + ) + + time_end = round(time.time() - time_start, 2) + tokens_per_second = round(len(result.encoded[0]) / time_end, 2) + + if not utils.koboldai_vars.quiet: + logger.info( + f"Generated {len(result.encoded[0])} tokens in {time_end} seconds, for an average rate of {tokens_per_second} tokens per second." + ) + + return result + + def generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new_tokens: int, + do_streaming: bool = False, + do_dynamic_wi: bool = False, + single_line: bool = False, + batch_count: int = 1, + ) -> torch.Tensor: + raise NotImplementedError + + def _post_token_gen(self, input_ids: torch.LongTensor) -> None: + for hook in self.post_token_hooks: + hook(self, input_ids) diff --git a/modeling/inference_models/api.py b/modeling/inference_models/api.py new file mode 100644 index 00000000..852ec01d --- /dev/null +++ b/modeling/inference_models/api.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import time +import json +import torch +import requests +import numpy as np +from typing import List, Union + +import utils +from logger import logger + +from modeling.inference_model import ( + GenerationResult, + GenerationSettings, + InferenceModel, +) + + +class APIException(Exception): + """To be used for errors when using the Kobold API as an interface.""" + + +class APIInferenceModel(InferenceModel): + def _load(self, save_model: bool, initial_load: bool) -> None: + tokenizer_id = requests.get( + utils.koboldai_vars.colaburl[:-8] + "/api/v1/model", + ).json()["result"] + self.tokenizer = self._get_tokenizer(tokenizer_id) + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + ): + decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens)) + + # Store context in memory to use it for comparison with generated content + utils.koboldai_vars.lastctx = decoded_prompt + + # Build request JSON data + reqdata = { + "prompt": decoded_prompt, + "max_length": max_new, + "max_context_length": utils.koboldai_vars.max_length, + "rep_pen": gen_settings.rep_pen, + "rep_pen_slope": gen_settings.rep_pen_slope, + "rep_pen_range": gen_settings.rep_pen_range, + "temperature": gen_settings.temp, + "top_p": gen_settings.top_p, + "top_k": gen_settings.top_k, + "top_a": gen_settings.top_a, + "tfs": gen_settings.tfs, + "typical": gen_settings.typical, + "n": batch_count, + } + + # Create request + while True: 
+ req = requests.post( + utils.koboldai_vars.colaburl[:-8] + "/api/v1/generate", + json=reqdata, + ) + if ( + req.status_code == 503 + ): # Server is currently generating something else so poll until it's our turn + time.sleep(1) + continue + + js = req.json() + if req.status_code != 200: + logger.error(json.dumps(js, indent=4)) + raise APIException(f"Bad API status code {req.status_code}") + + genout = [obj["text"] for obj in js["results"]] + return GenerationResult( + model=self, + out_batches=np.array([self.tokenizer.encode(x) for x in genout]), + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) diff --git a/modeling/inference_models/colab.py b/modeling/inference_models/colab.py new file mode 100644 index 00000000..87358e41 --- /dev/null +++ b/modeling/inference_models/colab.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import torch +import requests +import numpy as np +from typing import List, Union + +import utils + +from modeling.inference_model import ( + GenerationResult, + GenerationSettings, + InferenceModel, +) + + +class ColabException(Exception): + """To be used for errors when using the Colab API as an interface.""" + + +class ColabInferenceModel(InferenceModel): + def _load(self, save_model: bool, initial_load: bool) -> None: + self.tokenizer = self._get_tokenizer("EleutherAI/gpt-neo-2.7B") + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + ): + decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens)) + + # Store context in memory to use it for comparison with generated content + utils.koboldai_vars.lastctx = decoded_prompt + + # Build request JSON data + reqdata = { + "text": decoded_prompt, + "min": 0, + "max": max_new, + "rep_pen": gen_settings.rep_pen, + "rep_pen_slope": gen_settings.rep_pen_slope, + "rep_pen_range": gen_settings.rep_pen_range, + "temperature": gen_settings.temp, + "top_p": gen_settings.top_p, + "top_k": gen_settings.top_k, + "tfs": gen_settings.tfs, + "typical": gen_settings.typical, + "topa": gen_settings.top_a, + "numseqs": batch_count, + "retfultxt": False, + } + + # Create request + req = requests.post(utils.koboldai_vars.colaburl, json=reqdata) + + if req.status_code != 200: + raise ColabException(f"Bad status code {req.status_code}") + + # Deal with the response + js = req.json()["data"] + + # Try to be backwards compatible with outdated colab + if "text" in js: + genout = [utils.getnewcontent(js["text"], self.tokenizer)] + else: + genout = js["seqs"] + + return GenerationResult( + model=self, + out_batches=np.array([self.tokenizer.encode(x) for x in genout]), + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py new file mode 100644 index 00000000..d7372814 --- /dev/null +++ b/modeling/inference_models/generic_hf_torch.py @@ -0,0 +1,262 @@ +from __future__ import annotations + +import os +import json +import torch +import shutil +from typing import Union + +from transformers import AutoModelForCausalLM, GPTNeoForCausalLM + +import utils +import breakmodel +import torch_lazy_loader +import koboldai_settings + +from modeling.inference_models.hf_torch import HFTorchInferenceModel + + +class GenericHFTorchInferenceModel(HFTorchInferenceModel): + def _load(self, save_model: bool, initial_load: bool) -> None: + 
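+        # (Added note) allowsp: this backend supports soft prompts, which are
+        # consumed by the soft-token handling in core_generate().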
utils.koboldai_vars.allowsp = True
+
+        # Make model path the same as the model name to make this consistent
+        # with the other loading method if it isn't a known model type. This
+        # code is not just a workaround for below, it is also used to make the
+        # behavior consistent with other loading methods - Henk717
+        # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]:
+        #     utils.koboldai_vars.custmodpth = utils.koboldai_vars.model
+
+        if utils.koboldai_vars.model == "NeoCustom":
+            utils.koboldai_vars.model = os.path.basename(
+                os.path.normpath(utils.koboldai_vars.custmodpth)
+            )
+
+        # If we specify a model and it's in the root directory, we need to move
+        # it to the models directory (legacy folder structure to new)
+        if self.get_local_model_path(legacy=True):
+            shutil.move(
+                self.get_local_model_path(legacy=True, ignore_existance=True),
+                self.get_local_model_path(ignore_existance=True),
+            )
+
+        self.init_model_config()
+
+        tf_kwargs = {
+            "low_cpu_mem_usage": True,
+        }
+
+        if utils.koboldai_vars.model_type == "gpt2":
+            # We must disable low_cpu_mem_usage if using a GPT-2 model
+            # because GPT-2 is not compatible with this feature yet.
+            tf_kwargs.pop("low_cpu_mem_usage", None)
+
+            # Also, lazy loader doesn't support GPT-2 models
+            utils.koboldai_vars.lazy_load = False
+
+        # If we're using torch_lazy_loader, we need to get breakmodel config
+        # early so that it knows where to load the individual model tensors
+        if (
+            utils.koboldai_vars.lazy_load
+            and utils.koboldai_vars.hascuda
+            and utils.koboldai_vars.breakmodel
+            and not utils.koboldai_vars.nobreakmodel
+        ):
+            self.breakmodel_device_config(self.model_config)
+
+        if utils.koboldai_vars.lazy_load:
+            # If we're using lazy loader, we need to figure out what the model's hidden layers are called
+            with torch_lazy_loader.use_lazy_torch_load(
+                dematerialized_modules=True, use_accelerate_init_empty_weights=True
+            ):
+                try:
+                    metamodel = AutoModelForCausalLM.from_config(self.model_config)
+                except Exception:
+                    metamodel = GPTNeoForCausalLM.from_config(self.model_config)
+                utils.layers_module_names = utils.get_layers_module_names(metamodel)
+                utils.module_names = list(metamodel.state_dict().keys())
+                utils.named_buffers = list(metamodel.named_buffers(recurse=True))
+
+        # Download model from Huggingface if it does not exist, otherwise load locally
+        with self._maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(
+            enable=utils.koboldai_vars.lazy_load,
+            callback=self._get_lazy_load_callback(utils.num_layers(self.model_config))
+            if utils.koboldai_vars.lazy_load
+            else None,
+            dematerialized_modules=True,
+        ):
+            if utils.koboldai_vars.lazy_load:
+                # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
+                tf_kwargs.pop("low_cpu_mem_usage", None)
+
+            if self.get_local_model_path():
+                # Model is stored locally, load it.
+                self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
+                self.tokenizer = self._get_tokenizer(self.get_local_model_path())
+            else:
+                # Model not stored locally, we need to download it.
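+                # (Added note) The download path below temporarily patches
+                # torch._utils._rebuild_tensor so fp32 checkpoints are flagged
+                # (fp32_model); that flag drives the fp16 conversion in the
+                # save_model branch further down.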
+ + # _rebuild_tensor patch for casting dtype and supporting LazyTensors + old_rebuild_tensor = torch._utils._rebuild_tensor + + def new_rebuild_tensor( + storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], + storage_offset, + shape, + stride, + ): + if not isinstance(storage, torch_lazy_loader.LazyTensor): + dtype = storage.dtype + else: + dtype = storage.storage_type.dtype + if not isinstance(dtype, torch.dtype): + dtype = storage.storage_type(0).dtype + if dtype is torch.float32 and len(shape) >= 2: + utils.koboldai_vars.fp32_model = True + return old_rebuild_tensor(storage, storage_offset, shape, stride) + + torch._utils._rebuild_tensor = new_rebuild_tensor + self.model = self._get_model(utils.koboldai_vars.model, tf_kwargs) + self.tokenizer = self._get_tokenizer(utils.koboldai_vars.model) + torch._utils._rebuild_tensor = old_rebuild_tensor + + if save_model: + self.tokenizer.save_pretrained( + self.get_local_model_path(ignore_existance=True) + ) + + if utils.koboldai_vars.fp32_model and not breakmodel.disk_blocks: + # Use save_pretrained to convert fp32 models to fp16, + # unless we are using disk cache because save_pretrained + # is not supported in that case + model = model.half() + model.save_pretrained( + self.get_local_model_path(ignore_existance=True), + max_shard_size="500MiB", + ) + + else: + # For fp16 models, we can just copy the model files directly + import transformers.configuration_utils + import transformers.modeling_utils + import transformers.file_utils + import huggingface_hub + + # Save the config.json + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + utils.koboldai_vars.model, + transformers.configuration_utils.CONFIG_NAME, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + os.path.join( + self.get_local_model_path(ignore_existance=True), + transformers.configuration_utils.CONFIG_NAME, + ), + ) + + if utils.num_shards is None: + # Save the pytorch_model.bin or model.safetensors of an unsharded model + for possible_weight_name in [ + transformers.modeling_utils.WEIGHTS_NAME, + "model.safetensors", + ]: + try: + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + utils.koboldai_vars.model, + possible_weight_name, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + os.path.join( + self.get_local_model_path( + ignore_existance=True + ), + possible_weight_name, + ), + ) + except Exception: + if possible_weight_name == "model.safetensors": + raise + else: + # Handle saving sharded models + + with open(utils.from_pretrained_index_filename) as f: + map_data = json.load(f) + filenames = set(map_data["weight_map"].values()) + # Save the pytorch_model.bin.index.json of a sharded model + shutil.move( + os.path.realpath(utils.from_pretrained_index_filename), + os.path.join( + self.get_local_model_path(ignore_existance=True), + transformers.modeling_utils.WEIGHTS_INDEX_NAME, + ), + ) + # Then save the pytorch_model-#####-of-#####.bin files + for filename in filenames: + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + utils.koboldai_vars.model, + filename, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + os.path.join( + self.get_local_model_path( + ignore_existance=True + ), + filename, + ), + ) + shutil.rmtree("cache/") + + if ( + utils.koboldai_vars.badwordsids is 
+
+        self.patch_embedding()
+
+        if utils.koboldai_vars.hascuda:
+            if utils.koboldai_vars.usegpu:
+                # Use just VRAM
+                self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
+            elif utils.koboldai_vars.breakmodel:
+                # Use both RAM and VRAM (breakmodel)
+                if not utils.koboldai_vars.lazy_load:
+                    self.breakmodel_device_config(self.model.config)
+                self._move_to_devices()
+            elif breakmodel.disk_blocks > 0:
+                # Use disk
+                self._move_to_devices()
+            else:
+                # Use CPU
+                self.model = self.model.to("cpu").float()
+        elif breakmodel.disk_blocks > 0:
+            self._move_to_devices()
+        else:
+            self.model = self.model.to("cpu").float()
+        self.model.kai_model = self
+        utils.koboldai_vars.modeldim = self.get_hidden_size()
diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
new file mode 100644
index 00000000..dca2406e
--- /dev/null
+++ b/modeling/inference_models/hf.py
@@ -0,0 +1,52 @@
+import os
+from typing import Optional
+from transformers import AutoConfig
+
+import utils
+from logger import logger
+from modeling.inference_model import InferenceModel
+
+
+class HFInferenceModel(InferenceModel):
+    def __init__(self) -> None:
+        super().__init__()
+        self.model_config = None
+
+    def get_local_model_path(
+        self, legacy: bool = False, ignore_existance: bool = False
+    ) -> Optional[str]:
+        """
+        Returns a string of the model's path locally, or None if it is not downloaded.
+        If ignore_existance is true, it will always return a path.
+        """
+
+        basename = utils.koboldai_vars.model.replace("/", "_")
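+        # e.g. "EleutherAI/gpt-neo-2.7B" -> legacy path "EleutherAI_gpt-neo-2.7B"
+        # in the root directory, or "models/EleutherAI_gpt-neo-2.7B" otherwise.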
+        if legacy:
+            ret = basename
+        else:
+            ret = os.path.join("models", basename)
+
+        if os.path.isdir(ret) or ignore_existance:
+            return ret
+        return None
+
+    def init_model_config(self) -> None:
+        # Get the model_type from the config or assume a model type if it isn't present
+        try:
+            self.model_config = AutoConfig.from_pretrained(
+                self.get_local_model_path() or utils.koboldai_vars.model,
+                revision=utils.koboldai_vars.revision,
+                cache_dir="cache",
+            )
+            utils.koboldai_vars.model_type = self.model_config.model_type
+        except ValueError:
+            utils.koboldai_vars.model_type = {
+                "NeoCustom": "gpt_neo",
+                "GPT2Custom": "gpt2",
+            }.get(utils.koboldai_vars.model)
+
+            if not utils.koboldai_vars.model_type:
+                logger.warning(
+                    "No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)"
+                )
+                utils.koboldai_vars.model_type = "gpt_neo"
diff --git a/modeling/inference_models/hf_mtj.py b/modeling/inference_models/hf_mtj.py
new file mode 100644
index 00000000..19fed474
--- /dev/null
+++ b/modeling/inference_models/hf_mtj.py
@@ -0,0 +1,289 @@
+from __future__ import annotations
+
+import os
+import torch
+import numpy as np
+from eventlet import tpool
+from typing import List, Tuple, Union
+
+import utils
+import koboldai_settings
+from logger import logger, Colors
+
+from modeling.inference_model import (
+    GenerationResult,
+    GenerationSettings,
+    ModelCapabilities,
+)
+from modeling.inference_models.hf import HFInferenceModel
+
+try:
+    import tpu_mtj_backend
+except ModuleNotFoundError as e:
+    # Not on TPU... hopefully
+    if utils.koboldai_vars.use_colab_tpu:
+        raise e
+
+
+class HFMTJInferenceModel(HFInferenceModel):
+    def __init__(
+        self,
+        model_name: str,
+    ) -> None:
+        super().__init__()
+
+        self.model_name = model_name
+
+        self.model = None
+        self.tokenizer = None
+        self.model_config = None
+        self.capabilties = ModelCapabilities(
+            embedding_manipulation=False,
+            post_token_hooks=False,
+            stopper_hooks=False,
+            post_token_probs=False,
+        )
+
+    def setup_mtj(self) -> None:
+        def mtj_warper_callback(scores) -> "np.array":
+            scores_shape = scores.shape
+            scores_list = scores.tolist()
+            utils.koboldai_vars.lua_koboldbridge.logits = (
+                utils.koboldai_vars.lua_state.table()
+            )
+            for r, row in enumerate(scores_list):
+                utils.koboldai_vars.lua_koboldbridge.logits[
+                    r + 1
+                ] = utils.koboldai_vars.lua_state.table(*row)
+            utils.koboldai_vars.lua_koboldbridge.vocab_size = scores_shape[-1]
+
+            utils.koboldai_vars.lua_koboldbridge.execute_genmod()
+
+            scores = np.array(
+                tuple(
+                    tuple(row.values())
+                    for row in utils.koboldai_vars.lua_koboldbridge.logits.values()
+                ),
+                dtype=scores.dtype,
+            )
+            assert scores.shape == scores_shape
+
+            return scores
+
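+        # The logits round-trip through Lua is 1-indexed: row r of the scores
+        # becomes logits[r + 1], and the (possibly modified) table is turned
+        # back into an array of the original shape before sampling continues.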
+        def mtj_stopping_callback(
+            generated, n_generated, excluded_world_info
+        ) -> Tuple[List[set], bool, bool]:
+            utils.koboldai_vars.generated_tkns += 1
+
+            assert len(excluded_world_info) == len(generated)
+            regeneration_required = (
+                utils.koboldai_vars.lua_koboldbridge.regeneration_required
+            )
+            halt = (
+                utils.koboldai_vars.abort
+                or not utils.koboldai_vars.lua_koboldbridge.generating
+                or utils.koboldai_vars.generated_tkns >= utils.koboldai_vars.genamt
+            )
+            utils.koboldai_vars.lua_koboldbridge.regeneration_required = False
+
+            # Not sure what the deal is with this variable. It's been undefined
+            # as far back as I can trace it.
+            global past
+
+            for i in range(utils.koboldai_vars.numseqs):
+                utils.koboldai_vars.lua_koboldbridge.generated[i + 1][
+                    utils.koboldai_vars.generated_tkns
+                ] = int(
+                    generated[i, tpu_mtj_backend.params["seq"] + n_generated - 1].item()
+                )
+
+            if not utils.koboldai_vars.dynamicscan or halt:
+                return excluded_world_info, regeneration_required, halt
+
+            for i, t in enumerate(generated):
+                decoded = utils.decodenewlines(
+                    self.tokenizer.decode(past[i])
+                ) + utils.decodenewlines(
+                    self.tokenizer.decode(
+                        t[
+                            tpu_mtj_backend.params["seq"] : tpu_mtj_backend.params[
+                                "seq"
+                            ]
+                            + n_generated
+                        ]
+                    )
+                )
+                # _, found = checkworldinfo(decoded, force_use_txt=True, actions=koboldai_vars.actions)
+                _, _, _, found = utils.koboldai_vars.calc_ai_text(
+                    submitted_text=decoded
+                )
+                found -= excluded_world_info[i]
+                if len(found) != 0:
+                    regeneration_required = True
+                    break
+            return excluded_world_info, regeneration_required, halt
+
+        def mtj_compiling_callback() -> None:
+            print(Colors.GREEN + "TPU backend compilation triggered" + Colors.END)
+            utils.koboldai_vars.compiling = True
+
+        def mtj_stopped_compiling_callback() -> None:
+            print(Colors.GREEN + "TPU backend compilation stopped" + Colors.END)
+            utils.koboldai_vars.compiling = False
+
+        def mtj_settings_callback() -> dict:
+            sampler_order = utils.koboldai_vars.sampler_order[:]
+            if (
+                len(sampler_order) < 7
+            ):  # Add repetition penalty at beginning if it's not present
+                sampler_order = [6] + sampler_order
+            return {
+                "sampler_order": sampler_order,
+                "top_p": float(utils.koboldai_vars.top_p),
+                "temp": float(utils.koboldai_vars.temp),
+                "top_k": int(utils.koboldai_vars.top_k),
+                "tfs": float(utils.koboldai_vars.tfs),
+                "typical": float(utils.koboldai_vars.typical),
+                "top_a": float(utils.koboldai_vars.top_a),
+                "repetition_penalty": float(utils.koboldai_vars.rep_pen),
+                "rpslope": float(utils.koboldai_vars.rep_pen_slope),
+                "rprange": int(utils.koboldai_vars.rep_pen_range),
+            }
+
+        tpu_mtj_backend.socketio = utils.socketio
+
+        if utils.koboldai_vars.model == "TPUMeshTransformerGPTNeoX":
+            utils.koboldai_vars.badwordsids = utils.koboldai_vars.badwordsids_neox
+
+        print(
+            "{0}Initializing Mesh Transformer JAX, please wait...{1}".format(
+                Colors.PURPLE, Colors.END
+            )
+        )
+        if utils.koboldai_vars.model in (
+            "TPUMeshTransformerGPTJ",
+            "TPUMeshTransformerGPTNeoX",
+        ) and (
+            not utils.koboldai_vars.custmodpth
+            or not os.path.isdir(utils.koboldai_vars.custmodpth)
+        ):
+            raise FileNotFoundError(
+                f"The specified model path {repr(utils.koboldai_vars.custmodpth)} is not the path to a valid folder"
+            )
+        if utils.koboldai_vars.model == "TPUMeshTransformerGPTNeoX":
+            tpu_mtj_backend.pad_token_id = 2
+
+        tpu_mtj_backend.koboldai_vars = utils.koboldai_vars
+        tpu_mtj_backend.warper_callback = mtj_warper_callback
+        tpu_mtj_backend.stopping_callback = mtj_stopping_callback
+        tpu_mtj_backend.compiling_callback = mtj_compiling_callback
+        tpu_mtj_backend.stopped_compiling_callback = mtj_stopped_compiling_callback
+        tpu_mtj_backend.settings_callback = mtj_settings_callback
+
+    def _load(self, save_model: bool, initial_load: bool) -> None:
+        self.setup_mtj()
+        self.init_model_config()
+        utils.koboldai_vars.allowsp = True
+
+        tpu_mtj_backend.load_model(
+            utils.koboldai_vars.model,
+            hf_checkpoint=utils.koboldai_vars.model
+            not in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")
+            and utils.koboldai_vars.use_colab_tpu,
+            socketio_queue=koboldai_settings.queue,
+            initial_load=initial_load,
+            logger=logger,
+            **self.model_config.to_dict(),
+        )
+
+        utils.koboldai_vars.modeldim = int(
+            tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"])
+        )
+
+        self.tokenizer = tpu_mtj_backend.tokenizer
+        if (
+            utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default
+            and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj")
+        ):
+            utils.koboldai_vars.badwordsids = [
+                [v]
+                for k, v in self.tokenizer.get_vocab().items()
+                if any(c in str(k) for c in "<>[]")
+                if utils.koboldai_vars.newlinemode != "s" or str(k) != "</s>"
+            ]
+
+    def get_soft_tokens(self) -> np.array:
+        soft_tokens = None
+
+        if utils.koboldai_vars.sp is None:
+            tensor = np.zeros(
+                (
+                    1,
+                    tpu_mtj_backend.params.get(
+                        "d_embed", tpu_mtj_backend.params["d_model"]
+                    ),
+                ),
+                dtype=np.float32,
+            )
+            rows = tensor.shape[0]
+            padding_amount = (
+                tpu_mtj_backend.params["seq"]
+                - (
+                    tpu_mtj_backend.params["seq"]
+                    % -tpu_mtj_backend.params["cores_per_replica"]
+                )
+                - rows
+            )
+            tensor = np.pad(tensor, ((0, padding_amount), (0, 0)))
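+            # e.g. with seq=2048, cores_per_replica=8 and this single zero row:
+            # padding_amount = 2048 - (2048 % -8) - 1 = 2047, so the tensor is
+            # padded to a full 2048 rows before being sharded across cores.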
+            tensor = tensor.reshape(
+                tpu_mtj_backend.params["cores_per_replica"],
+                -1,
+                tpu_mtj_backend.params.get(
+                    "d_embed", tpu_mtj_backend.params["d_model"]
+                ),
+            )
+            utils.koboldai_vars.sp = tpu_mtj_backend.shard_xmap(tensor)
+
+        soft_tokens = np.arange(
+            tpu_mtj_backend.params["n_vocab"]
+            + tpu_mtj_backend.params["n_vocab_padding"],
+            tpu_mtj_backend.params["n_vocab"]
+            + tpu_mtj_backend.params["n_vocab_padding"]
+            + utils.koboldai_vars.sp_length,
+            dtype=np.uint32,
+        )
+        return soft_tokens
+
+    def _raw_generate(
+        self,
+        prompt_tokens: Union[List[int], torch.Tensor],
+        max_new: int,
+        gen_settings: GenerationSettings,
+        single_line: bool = False,
+        batch_count: int = 1,
+    ) -> GenerationResult:
+        soft_tokens = self.get_soft_tokens()
+
+        genout = tpool.execute(
+            tpu_mtj_backend.infer_static,
+            np.uint32(prompt_tokens),
+            gen_len=max_new,
+            temp=gen_settings.temp,
+            top_p=gen_settings.top_p,
+            top_k=gen_settings.top_k,
+            tfs=gen_settings.tfs,
+            typical=gen_settings.typical,
+            top_a=gen_settings.top_a,
+            numseqs=batch_count,
+            repetition_penalty=gen_settings.rep_pen,
+            rpslope=gen_settings.rep_pen_slope,
+            rprange=gen_settings.rep_pen_range,
+            soft_embeddings=utils.koboldai_vars.sp,
+            soft_tokens=soft_tokens,
+            sampler_order=gen_settings.sampler_order,
+        )
+        genout = np.array(genout)
+
+        return GenerationResult(
+            self,
+            out_batches=genout,
+            prompt=prompt_tokens,
+            is_whole_generation=True,
+            single_line=single_line,
+        )
diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py
new file mode 100644
index 00000000..ee83259d
--- /dev/null
+++ b/modeling/inference_models/hf_torch.py
@@ -0,0 +1,1053 @@
+from __future__ import annotations
+
+import gc
+import os
+import time
+import bisect
+import zipfile
+import functools
+import itertools
+import traceback
+import contextlib
+from tqdm.auto import tqdm
+from typing import Dict, List, Union
+
+import torch
+from torch.nn import Embedding
+import transformers
+from transformers import (
+    StoppingCriteria,
+    GPTNeoForCausalLM,
+    AutoModelForCausalLM,
+    LogitsProcessorList,
+    LogitsProcessor,
+)
+
+import utils
+import torch_lazy_loader
+from logger import logger, Colors
+
+from modeling import warpers
+from modeling import inference_model
+from modeling.warpers import Warper
+from modeling.stoppers import Stoppers
+from modeling.post_token_hooks import PostTokenHooks
+from modeling.inference_models.hf import HFInferenceModel
+from modeling.inference_model import (
+    GenerationResult,
+    GenerationSettings,
+    InferenceModel,
+    ModelCapabilities,
+    use_core_manipulations,
+)
+
+try:
+    import breakmodel
+    import accelerate.utils
+except ModuleNotFoundError as e:
+    if not utils.koboldai_vars.use_colab_tpu:
+        raise e
+
+
+class HFTorchInferenceModel(HFInferenceModel):
+    def __init__(
+        self,
+        model_name: str,
+        lazy_load: bool,
+        low_mem: bool,
+    ) -> None:
+        super().__init__()
+
+        self.model_name = model_name
+        self.lazy_load = lazy_load
+        self.low_mem = low_mem
+
+        self.post_token_hooks = [
+            PostTokenHooks.stream_tokens,
+        ]
+
+        self.stopper_hooks = [
+            Stoppers.core_stopper,
+            Stoppers.dynamic_wi_scanner,
+            Stoppers.chat_mode_stopper,
+        ]
+
+        self.model = None
+        self.tokenizer = None
+        self.capabilties = ModelCapabilities(
+            embedding_manipulation=True,
+            post_token_hooks=True,
+            stopper_hooks=True,
+            post_token_probs=True,
+        )
+        self._old_stopping_criteria = None
+
+    def _apply_warpers(
+        self, scores: torch.Tensor, input_ids: torch.Tensor
+    ) -> torch.Tensor:
+        warpers.update_settings()
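+        # Sampler IDs come from the UI's sampler_order setting; id 6 is the
+        # repetition penalty, the one warper that also needs the context
+        # tokens, hence the special case below.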
+        for sid in utils.koboldai_vars.sampler_order:
+            warper = Warper.from_id(sid)
+            if warper == warpers.RepetitionPenalty:
+                # Rep pen needs more data than other samplers
+                scores = warper.torch(scores, input_ids=input_ids)
+            else:
+                scores = warper.torch(scores)
+        return scores
+
+    def _post_load(self) -> None:
+        # Patch stopping_criteria
+
+        class PTHStopper(StoppingCriteria):
+            def __call__(
+                hf_self,
+                input_ids: torch.LongTensor,
+                scores: torch.FloatTensor,
+            ) -> bool:
+                self._post_token_gen(input_ids)
+
+                for stopper in self.stopper_hooks:
+                    do_stop = stopper(self, input_ids)
+                    if do_stop:
+                        return True
+                return False
+
+        old_gsc = transformers.GenerationMixin._get_stopping_criteria
+
+        def _get_stopping_criteria(
+            hf_self,
+            *args,
+            **kwargs,
+        ):
+            stopping_criteria = old_gsc(hf_self, *args, **kwargs)
+            stopping_criteria.insert(0, PTHStopper())
+            return stopping_criteria
+
+        use_core_manipulations.get_stopping_criteria = _get_stopping_criteria
+
+        # Patch logitswarpers
+
+        class PhraseBiasLogitsProcessor(LogitsProcessor):
+            def __init__(self):
+                pass
+
+            def _find_intersection(self, big: List, small: List) -> int:
+                """Find the maximum overlap between the beginning of small and the end of big.
+                Return the index of the token in small following the overlap, or 0.
+
+                big: The tokens in the context (as a tensor)
+                small: The tokens in the phrase to bias (as a list)
+
+                Both big and small are in "oldest to newest" order.
+                """
+                # There are asymptotically more efficient methods for determining the overlap,
+                # but typically there will be few (0-1) instances of small[0] in the last len(small)
+                # elements of big, plus small will typically be fairly short. So this naive
+                # approach is acceptable despite O(N^2) worst case performance.
+
+                num_small = len(small)
+                # The small list can only ever match against at most num_small tokens of big,
+                # so create a slice. Typically, this slice will be as long as small, but it
+                # may be shorter if the story has just started.
+                # We need to convert the big slice to list, since natively big is a tensor
+                # and tensor and list don't ever compare equal. It's better to convert here
+                # and then use native equality tests than to iterate repeatedly later.
+                big_slice = list(big[-num_small:])
+
+                # It's possible that the start token appears multiple times in small
+                # For example, consider the phrase:
+                # [ fair is foul, and foul is fair, hover through the fog and filthy air]
+                # If we merely look for the first instance of [ fair], then we would
+                # generate the following output:
+                # " fair is foul, and foul is fair is foul, and foul is fair..."
+                start = small[0]
+                for i, t in enumerate(big_slice):
+                    # Strictly unnecessary, but it's marginally faster to test the first
+                    # token before creating slices to test for a full match.
+                    if t == start:
+                        remaining = len(big_slice) - i
+                        if big_slice[i:] == small[:remaining]:
+                            # We found a match. If the small phrase has any remaining tokens
+                            # then return the index of the next token.
+                            if remaining < num_small:
+                                return remaining
+                            # In this case, the entire small phrase matched, so start over.
+                            return 0
+
+                # There were no matches, so just begin at the beginning.
+                return 0
+
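+            # Worked example: big = [1, 2, 3, 4], small = [3, 4, 5] gives
+            # big_slice = [2, 3, 4]; the match starts at index 1 with
+            # remaining = 2 and small[:2] == [3, 4], so 2 is returned and
+            # small[2] (the 5) is the next token to bias.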
+            def _allow_leftwards_tampering(self, phrase: str) -> bool:
+                """Determines if a phrase should be tampered with from the left in
+                the "soft" token encoding mode."""
+
+                if phrase[0] in [".", "?", "!", ";", ":", "\n"]:
+                    return False
+                return True
+
+            def _get_token_sequence(self, phrase: str) -> List[List]:
+                """Convert the phrase string into a list of encoded biases, each
+                one being a list of tokens. How this is done is determined by the
+                phrase's format:
+
+                - If the phrase is surrounded by square brackets ([]), the tokens
+                  will be the phrase split by commas (,). If a "token" isn't
+                  actually a number, it will be skipped. NOTE: Tokens output by
+                  this may not be in the model's vocabulary, and such tokens
+                  should be ignored later in the pipeline.
+                - If the phrase is surrounded by curly brackets ({}), the phrase
+                  will be directly encoded with no synonym biases and no fancy
+                  tricks.
+                - Otherwise, the phrase will be encoded, with close deviations
+                  being included as synonym biases.
+                """
+
+                # TODO: Cache these tokens, invalidate when model or bias is
+                # changed.
+
+                # Handle direct token id input
+                if phrase.startswith("[") and phrase.endswith("]"):
+                    no_brackets = phrase[1:-1]
+                    ret = []
+                    for token_id in no_brackets.split(","):
+                        try:
+                            ret.append(int(token_id))
+                        except ValueError:
+                            # Ignore non-numbers. Rascals!
+                            pass
+                    return [ret]
+
+                # Handle direct phrases
+                if phrase.startswith("{") and phrase.endswith("}"):
+                    no_brackets = phrase[1:-1]
+                    return [inference_model.current_model.tokenizer.encode(no_brackets)]
+
+                # Handle untamperable phrases
+                if not self._allow_leftwards_tampering(phrase):
+                    return [inference_model.current_model.tokenizer.encode(phrase)]
+
+                # Handle slight alterations to original phrase
+                phrase = phrase.strip(" ")
+                ret = []
+
+                for alt_phrase in [phrase, f" {phrase}"]:
+                    ret.append(
+                        inference_model.current_model.tokenizer.encode(alt_phrase)
+                    )
+
+                return ret
+
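+            # e.g. "[50256, 198]" biases those exact token ids, "{Hello}" biases
+            # only the literal encoding, and a bare "Hello" is encoded both as
+            # "Hello" and " Hello" so either tokenization can be biased.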
+            def _get_biased_tokens(self, input_ids: List) -> Dict:
+                # TODO: Different "bias slopes"?
+
+                ret = {}
+                for phrase, _bias in utils.koboldai_vars.biases.items():
+                    bias_score, completion_threshold = _bias
+                    token_seqs = self._get_token_sequence(phrase)
+                    variant_deltas = {}
+                    for token_seq in token_seqs:
+                        bias_index = self._find_intersection(input_ids, token_seq)
+
+                        # Ensure completion after completion_threshold tokens
+                        # Only provide a positive bias when the base bias score is positive.
+                        if bias_score > 0 and bias_index + 1 > completion_threshold:
+                            bias_score = 999
+
+                        token_to_bias = token_seq[bias_index]
+                        variant_deltas[token_to_bias] = bias_score
+
+                    # If multiple phrases bias the same token, add the modifiers
+                    # together. This should NOT be applied to automatic variants
+                    for token_to_bias, bias_score in variant_deltas.items():
+                        if token_to_bias in ret:
+                            ret[token_to_bias] += bias_score
+                        else:
+                            ret[token_to_bias] = bias_score
+                return ret
+
+            def __call__(
+                self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+            ) -> torch.FloatTensor:
+                assert scores.ndim == 2
+                assert input_ids.ndim == 2
+
+                scores_shape = scores.shape
+
+                for batch in range(scores_shape[0]):
+                    for token, bias in self._get_biased_tokens(
+                        input_ids[batch]
+                    ).items():
+                        scores[batch][token] += bias
+
+                return scores
+
+        class LuaLogitsProcessor(LogitsProcessor):
+            def __init__(self):
+                pass
+
+            def __call__(
+                self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+            ) -> torch.FloatTensor:
+                assert scores.ndim == 2
+                assert input_ids.ndim == 2
+                self.regeneration_required = False
+                self.halt = False
+
+                if utils.koboldai_vars.standalone:
+                    return scores
+
+                scores_shape = scores.shape
+                scores_list = scores.tolist()
+                utils.koboldai_vars.lua_koboldbridge.logits = (
+                    utils.koboldai_vars.lua_state.table()
+                )
+                for r, row in enumerate(scores_list):
+                    utils.koboldai_vars.lua_koboldbridge.logits[
+                        r + 1
+                    ] = utils.koboldai_vars.lua_state.table(*row)
+                utils.koboldai_vars.lua_koboldbridge.vocab_size = scores_shape[-1]
+
+                utils.koboldai_vars.lua_koboldbridge.execute_genmod()
+
+                scores = torch.tensor(
+                    tuple(
+                        tuple(row.values())
+                        for row in utils.koboldai_vars.lua_koboldbridge.logits.values()
+                    ),
+                    device=scores.device,
+                    dtype=scores.dtype,
+                )
+                assert scores.shape == scores_shape
+
+                return scores
+
+        from torch.nn import functional as F
+
+        def visualize_probabilities(
+            model: InferenceModel,
+            scores: torch.FloatTensor,
+        ) -> None:
+            assert scores.ndim == 2
+
+            if utils.koboldai_vars.numseqs > 1 or not utils.koboldai_vars.show_probs:
+                return
+
+            if not utils.koboldai_vars.show_probs:
+                return scores
+
+            option_offset = 0
+            if (
+                utils.koboldai_vars.actions.action_count + 1
+                in utils.koboldai_vars.actions.actions
+            ):
+                for x in range(
+                    len(
+                        utils.koboldai_vars.actions.actions[
+                            utils.koboldai_vars.actions.action_count + 1
+                        ]["Options"]
+                    )
+                ):
+                    option = utils.koboldai_vars.actions.actions[
+                        utils.koboldai_vars.actions.action_count + 1
+                    ]["Options"][x]
+                    if (
+                        option["Pinned"]
+                        or option["Previous Selection"]
+                        or option["Edited"]
+                    ):
+                        option_offset = x + 1
+            batch_offset = (
+                int(
+                    (utils.koboldai_vars.generated_tkns - 1)
+                    / utils.koboldai_vars.genamt
+                )
+                if utils.koboldai_vars.alt_multi_gen
+                else 0
+            )
+            for batch_index, batch in enumerate(scores):
+                probs = F.softmax(batch, dim=-1).cpu().numpy()
+
+                token_prob_info = []
+                for token_id, score in sorted(
+                    enumerate(probs), key=lambda x: x[1], reverse=True
+                )[:8]:
+                    token_prob_info.append(
+                        {
+                            "tokenId": token_id,
+                            "decoded": utils.decodenewlines(
+                                model.tokenizer.decode(token_id)
+                            ),
+                            "score": float(score),
+                        }
+                    )
+
+                if utils.koboldai_vars.numseqs == 1:
+                    utils.koboldai_vars.actions.set_probabilities(token_prob_info)
+                else:
+                    utils.koboldai_vars.actions.set_option_probabilities(
+                        token_prob_info, batch_index + option_offset + batch_offset
+                    )
+
+            return scores
+
+        def new_get_logits_processor(*args, **kwargs) -> LogitsProcessorList:
+            processors = new_get_logits_processor.old_get_logits_processor(
+                *args, **kwargs
+            )
+            # TODOB4MERGE: These two
+            # processors.insert(0, LuaLogitsProcessor())
+            # processors.append(PhraseBiasLogitsProcessor())
+            return processors
+
+        use_core_manipulations.get_logits_processor = new_get_logits_processor
+        new_get_logits_processor.old_get_logits_processor = (
+            transformers.GenerationMixin._get_logits_processor
+        )
+
+        class KoboldLogitsWarperList(LogitsProcessorList):
+            def __init__(self):
+                pass
+
+            def __call__(
+                lw_self,
+                input_ids: torch.LongTensor,
+                scores: torch.FloatTensor,
+                *args,
+                **kwargs,
+            ):
+                # sampler_order = utils.koboldai_vars.sampler_order[:]
+                # if (
+                #     len(sampler_order) < 7
+                # ):  # Add repetition penalty at beginning if it's not present
+                #     sampler_order = [6] + sampler_order
+                # for k in sampler_order:
+                #     scores = self.__warper_list[k](input_ids, scores, *args, **kwargs)
+                scores = self._apply_warpers(scores=scores, input_ids=input_ids)
+                visualize_probabilities(inference_model.current_model, scores)
+                return scores
+
+        def new_get_logits_warper(
+            beams: int = 1,
+        ) -> LogitsProcessorList:
+            return KoboldLogitsWarperList()
+
+        def new_sample(self, *args, **kwargs):
+            assert kwargs.pop("logits_warper", None) is not None
+            kwargs["logits_warper"] = new_get_logits_warper(
+                beams=1,
+            )
+            if utils.koboldai_vars.newlinemode in ["s", "ns"]:
+                kwargs["eos_token_id"] = -1
+                kwargs.setdefault("pad_token_id", 2)
+            return new_sample.old_sample(self, *args, **kwargs)
+
+        new_sample.old_sample = transformers.GenerationMixin.sample
+        use_core_manipulations.sample = new_sample
+
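+        # Note: with newline modes "s"/"ns" the EOS token also stands in for
+        # the newline, so EOS-based stopping is disabled above by passing
+        # eos_token_id=-1.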
+    def _raw_generate(
+        self,
+        prompt_tokens: Union[List[int], torch.Tensor],
+        max_new: int,
+        gen_settings: GenerationSettings,
+        single_line: bool = False,
+        batch_count: int = 1,
+    ) -> GenerationResult:
+        if not isinstance(prompt_tokens, torch.Tensor):
+            gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
+        else:
+            gen_in = prompt_tokens
+
+        device = utils.get_auxilary_device()
+        gen_in = gen_in.to(device)
+
+        additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else []
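+        # In single-line mode the encoded newline is added to the bad words
+        # list so the model cannot emit a line break mid-generation.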
+
+        with torch.no_grad():
+            start_time = time.time()
+            genout = self.model.generate(
+                gen_in,
+                do_sample=True,
+                max_length=min(
+                    len(prompt_tokens) + max_new, utils.koboldai_vars.max_length
+                ),
+                repetition_penalty=1.0,
+                bad_words_ids=utils.koboldai_vars.badwordsids
+                + additional_bad_words_ids,
+                use_cache=True,
+                num_return_sequences=batch_count,
+            )
+        logger.debug(
+            "torch_raw_generate: run generator {}s".format(time.time() - start_time)
+        )
+
+        return GenerationResult(
+            self,
+            out_batches=genout,
+            prompt=prompt_tokens,
+            is_whole_generation=False,
+            output_includes_prompt=True,
+        )
+
+    def _get_model(self, location: str, tf_kwargs: Dict):
+        try:
+            return AutoModelForCausalLM.from_pretrained(
+                location,
+                revision=utils.koboldai_vars.revision,
+                cache_dir="cache",
+                **tf_kwargs,
+            )
+        except Exception as e:
+            if "out of memory" in traceback.format_exc().lower():
+                raise RuntimeError(
+                    "One of your GPUs ran out of memory when KoboldAI tried to load your model."
+                )
+            return GPTNeoForCausalLM.from_pretrained(
+                location,
+                revision=utils.koboldai_vars.revision,
+                cache_dir="cache",
+                **tf_kwargs,
+            )
+
+    def get_hidden_size(self) -> int:
+        return self.model.get_input_embeddings().embedding_dim
+
+    def _move_to_devices(self) -> None:
+        if not utils.koboldai_vars.breakmodel:
+            if utils.koboldai_vars.usegpu:
+                self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
+            else:
+                self.model = self.model.to("cpu").float()
+            return
+
+        for key, value in self.model.state_dict().items():
+            target_dtype = (
+                torch.float32 if breakmodel.primary_device == "cpu" else torch.float16
+            )
+            if value.dtype is not target_dtype:
+                accelerate.utils.set_module_tensor_to_device(
+                    self.model, key, target_dtype
+                )
+
+        disk_blocks = breakmodel.disk_blocks
+        gpu_blocks = breakmodel.gpu_blocks
+        ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks)
+        cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
+        device_map = {}
+
+        for name in utils.layers_module_names:
+            layer = int(name.rsplit(".", 1)[1])
+            device = (
+                ("disk" if layer < disk_blocks else "cpu")
+                if layer < ram_blocks
+                else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
+            )
+            device_map[name] = device
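+        # Worked example: 12 layers, disk_blocks=2, gpu_blocks=[3, 4] gives
+        # ram_blocks = 5, so layers 0-1 go to disk, 2-4 to CPU RAM, 5-7 to
+        # GPU 0 and 8-11 to GPU 1 (via bisect over the cumulative blocks).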
+
+        for name in utils.get_missing_module_names(self.model, list(device_map.keys())):
+            device_map[name] = breakmodel.primary_device
+
+        breakmodel.dispatch_model_ex(
+            self.model,
+            device_map,
+            main_device=breakmodel.primary_device,
+            offload_buffers=True,
+            offload_dir="accelerate-disk-cache",
+        )
+
+        gc.collect()
+        return
+
+    # Function to patch transformers to use our soft prompt
+    def patch_embedding(self) -> None:
+        if getattr(Embedding, "_koboldai_patch_causallm_model", None):
+            Embedding._koboldai_patch_causallm_model = self.model
+            return
+
+        old_embedding_call = Embedding.__call__
+
+        kai_model = self
+
+        def new_embedding_call(self, input_ids, *args, **kwargs):
+            # Don't touch embeddings for models other than the core inference model (that's us!)
+            if (
+                Embedding._koboldai_patch_causallm_model.get_input_embeddings()
+                is not self
+            ):
+                return old_embedding_call(self, input_ids, *args, **kwargs)
+
+            assert input_ids is not None
+
+            if utils.koboldai_vars.sp is not None:
+                shifted_input_ids = input_ids - kai_model.model.config.vocab_size
+
+            input_ids.clamp_(max=kai_model.model.config.vocab_size - 1)
+            inputs_embeds = old_embedding_call(self, input_ids, *args, **kwargs)
+
+            if utils.koboldai_vars.sp is not None:
+                utils.koboldai_vars.sp = utils.koboldai_vars.sp.to(
+                    inputs_embeds.dtype
+                ).to(inputs_embeds.device)
+                inputs_embeds = torch.where(
+                    (shifted_input_ids >= 0)[..., None],
+                    utils.koboldai_vars.sp[shifted_input_ids.clamp(min=0)],
+                    inputs_embeds,
+                )
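+                # e.g. with vocab_size=50257, input id 50257 maps to
+                # soft-prompt row 0 while ordinary token ids keep their
+                # normal embeddings.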
+
+            return inputs_embeds
+
+        Embedding.__call__ = new_embedding_call
+        Embedding._koboldai_patch_causallm_model = self.model
+
+    def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True):
+        if not self.lazy_load:
+            return
+
+        if utils.args.breakmodel_disklayers is not None:
+            breakmodel.disk_blocks = utils.args.breakmodel_disklayers
+
+        disk_blocks = breakmodel.disk_blocks
+        gpu_blocks = breakmodel.gpu_blocks
+        ram_blocks = n_layers - sum(gpu_blocks)
+        cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
+
+        def lazy_load_callback(
+            model_dict: Dict[str, Union[torch_lazy_loader.LazyTensor, torch.Tensor]],
+            f,
+            **_,
+        ):
+            if lazy_load_callback.nested:
+                return
+            lazy_load_callback.nested = True
+
+            device_map: Dict[str, Union[str, int]] = {}
+
+            @functools.lru_cache(maxsize=None)
+            def get_original_key(key):
+                return max(
+                    (
+                        original_key
+                        for original_key in utils.module_names
+                        if original_key.endswith(key)
+                    ),
+                    key=len,
+                )
+
+            for key, value in model_dict.items():
+                original_key = get_original_key(key)
+                if isinstance(value, torch_lazy_loader.LazyTensor) and not any(
+                    original_key.startswith(n) for n in utils.layers_module_names
+                ):
+                    device_map[key] = (
+                        utils.koboldai_vars.gpu_device
+                        if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu
+                        else "cpu"
+                        if not utils.koboldai_vars.hascuda
+                        or not utils.koboldai_vars.breakmodel
+                        else breakmodel.primary_device
+                    )
+                else:
+                    layer = int(
+                        max(
+                            (
+                                n
+                                for n in utils.layers_module_names
+                                if original_key.startswith(n)
+                            ),
+                            key=len,
+                        ).rsplit(".", 1)[1]
+                    )
+                    device = (
+                        utils.koboldai_vars.gpu_device
+                        if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu
+                        else "disk"
+                        if layer < disk_blocks and layer < ram_blocks
+                        else "cpu"
+                        if not utils.koboldai_vars.hascuda
+                        or not utils.koboldai_vars.breakmodel
+                        else "shared"
+                        if layer < ram_blocks
+                        else bisect.bisect_right(
+                            cumulative_gpu_blocks, layer - ram_blocks
+                        )
+                    )
+                    device_map[key] = device
+
+            if utils.num_shards is None or utils.current_shard == 0:
+                utils.offload_index = {}
+                if os.path.isdir("accelerate-disk-cache"):
+                    # Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder
+                    # (the folder doesn't contain any subfolders so os.remove will do just fine)
+                    for filename in os.listdir("accelerate-disk-cache"):
+                        try:
+                            os.remove(os.path.join("accelerate-disk-cache", filename))
+                        except OSError:
+                            pass
+                os.makedirs("accelerate-disk-cache", exist_ok=True)
+                if utils.num_shards is not None:
+                    num_tensors = len(
+                        utils.get_sharded_checkpoint_num_tensors(
+                            utils.from_pretrained_model_name,
+                            utils.from_pretrained_index_filename,
+                            **utils.from_pretrained_kwargs,
+                        )
+                    )
+                else:
+                    num_tensors = len(device_map)
+                print(flush=True)
+                utils.koboldai_vars.status_message = "Loading model"
+                utils.koboldai_vars.total_layers = num_tensors
+                utils.koboldai_vars.loaded_layers = 0
+                utils.bar = tqdm(
+                    total=num_tensors,
+                    desc="Loading model tensors",
+                    file=utils.UIProgressBarFile(),
+                )
+
+            with zipfile.ZipFile(f, "r") as z:
+                try:
+                    last_storage_key = None
+                    zipfolder = os.path.basename(os.path.normpath(f)).split(".")[0]
+                    f = None
+                    current_offset = 0
+                    able_to_pin_layers = True
+                    if utils.num_shards is not None:
+                        utils.current_shard += 1
+                    for key in sorted(
+                        device_map.keys(),
+                        key=lambda k: (model_dict[k].key, model_dict[k].seek_offset),
+                    ):
+                        storage_key = model_dict[key].key
+                        if (
+                            storage_key != last_storage_key
+                            or model_dict[key].seek_offset < current_offset
+                        ):
+                            last_storage_key = storage_key
+                            if isinstance(f, zipfile.ZipExtFile):
+                                f.close()
+                            try:
+                                f = z.open(f"archive/data/{storage_key}")
+                            except:
+                                f = z.open(f"{zipfolder}/data/{storage_key}")
+                            current_offset = 0
+                        if current_offset != model_dict[key].seek_offset:
+                            f.read(model_dict[key].seek_offset - current_offset)
+                            current_offset = model_dict[key].seek_offset
+                        device = device_map[key]
+                        size = functools.reduce(
+                            lambda x, y: x * y, model_dict[key].shape, 1
+                        )
+                        dtype = model_dict[key].dtype
+                        nbytes = (
+                            size
+                            if dtype is torch.bool
+                            else size
+                            * (
+                                (
+                                    torch.finfo
+                                    if dtype.is_floating_point
+                                    else torch.iinfo
+                                )(dtype).bits
+                                >> 3
+                            )
+                        )
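+                        # e.g. a (4096, 4096) float16 tensor: 16,777,216
+                        # elements at 16 bits >> 3 = 2 bytes each, so nbytes
+                        # is about 33.5 MB.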
+                        # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
+                        model_dict[key] = model_dict[key].materialize(
+                            f, map_location="cpu"
+                        )
+                        if model_dict[key].dtype is torch.float32:
+                            utils.koboldai_vars.fp32_model = True
+                        if (
+                            convert_to_float16
+                            and breakmodel.primary_device != "cpu"
+                            and utils.koboldai_vars.hascuda
+                            and (
+                                utils.koboldai_vars.breakmodel
+                                or utils.koboldai_vars.usegpu
+                            )
+                            and model_dict[key].dtype is torch.float32
+                        ):
+                            model_dict[key] = model_dict[key].to(torch.float16)
+                        if breakmodel.primary_device == "cpu" or (
+                            not utils.koboldai_vars.usegpu
+                            and not utils.koboldai_vars.breakmodel
+                            and model_dict[key].dtype is torch.float16
+                        ):
+                            model_dict[key] = model_dict[key].to(torch.float32)
+                        if device == "shared":
+                            model_dict[key] = model_dict[key].to("cpu").detach_()
+                            if able_to_pin_layers:
+                                try:
+                                    model_dict[key] = model_dict[key].pin_memory()
+                                except:
+                                    able_to_pin_layers = False
+                        elif device == "disk":
+                            accelerate.utils.offload_weight(
+                                model_dict[key],
+                                get_original_key(key),
+                                "accelerate-disk-cache",
+                                index=utils.offload_index,
+                            )
+                            model_dict[key] = model_dict[key].to("meta")
+                        else:
+                            model_dict[key] = model_dict[key].to(device)
+                        # print("OK", flush=True)
+                        current_offset += nbytes
+                        utils.bar.update(1)
+                        utils.koboldai_vars.loaded_layers += 1
+                finally:
+                    if (
+                        utils.num_shards is None
+                        or utils.current_shard >= utils.num_shards
+                    ):
+                        if utils.offload_index:
+                            for name, tensor in utils.named_buffers:
+                                dtype = tensor.dtype
+                                if (
+                                    convert_to_float16
+                                    and breakmodel.primary_device != "cpu"
+                                    and utils.koboldai_vars.hascuda
+                                    and (
+                                        utils.koboldai_vars.breakmodel
+                                        or utils.koboldai_vars.usegpu
+                                    )
+                                ):
+                                    dtype = torch.float16
+                                if breakmodel.primary_device == "cpu" or (
+                                    not utils.koboldai_vars.usegpu
+                                    and not utils.koboldai_vars.breakmodel
+                                ):
+                                    dtype = torch.float32
+                                if (
+                                    name in model_dict
+                                    and model_dict[name].dtype is not dtype
+                                ):
+                                    model_dict[name] = model_dict[name].to(dtype)
+                                if tensor.dtype is not dtype:
+                                    tensor = tensor.to(dtype)
+                                if name not in utils.offload_index:
+                                    accelerate.utils.offload_weight(
+                                        tensor,
+                                        name,
+                                        "accelerate-disk-cache",
+                                        index=utils.offload_index,
+                                    )
+                            accelerate.utils.save_offload_index(
+                                utils.offload_index, "accelerate-disk-cache"
+                            )
+                        utils.bar.close()
+                        utils.bar = None
+                        utils.koboldai_vars.status_message = ""
+                        lazy_load_callback.nested = False
+                        if isinstance(f, zipfile.ZipExtFile):
+                            f.close()
+
+        lazy_load_callback.nested = False
+        return lazy_load_callback
+
+    @contextlib.contextmanager
+    def _maybe_use_float16(self, always_use: bool = False):
+        if always_use or (
+            utils.koboldai_vars.hascuda
+            and self.low_mem
+            and (utils.koboldai_vars.usegpu or utils.koboldai_vars.breakmodel)
+        ):
+            original_dtype = torch.get_default_dtype()
+            torch.set_default_dtype(torch.float16)
+            yield True
+            torch.set_default_dtype(original_dtype)
+        else:
+            yield False
+
+    def breakmodel_device_list(self, n_layers, primary=None, selected=None):
+        # TODO: Find a better place for this or rework this
+
+        device_count = torch.cuda.device_count()
+        if device_count < 2:
+            primary = None
+        gpu_blocks = breakmodel.gpu_blocks + (
+            device_count - len(breakmodel.gpu_blocks)
+        ) * [0]
+        print(f"{Colors.YELLOW}       DEVICE ID  |  LAYERS  |  DEVICE NAME{Colors.END}")
+        for i in range(device_count):
+            name = torch.cuda.get_device_name(i)
+            if len(name) > 47:
+                name = "..." + name[-44:]
+            row_color = Colors.END
+            sep_color = Colors.YELLOW
+            print(
+                f"{row_color}{Colors.YELLOW + '->' + row_color if i == selected else '  '} {'(primary)' if i == primary else ' '*9} {i:3}  {sep_color}|{row_color}     {gpu_blocks[i]:3}  {sep_color}|{row_color}  {name}{Colors.END}"
+            )
+        row_color = Colors.END
+        sep_color = Colors.YELLOW
+        print(
+            f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else '  '} {' '*9} N/A  {sep_color}|{row_color}     {breakmodel.disk_blocks:3}  {sep_color}|{row_color}  (Disk cache){Colors.END}"
+        )
+        print(
+            f"{row_color}   {' '*9} N/A  {sep_color}|{row_color}     {n_layers:3}  {sep_color}|{row_color}  (CPU){Colors.END}"
+        )
+
+    def breakmodel_device_config(self, config):
+        # TODO: Find a better place for this or rework this
+
+        global breakmodel, generator
+        import breakmodel
+
+        n_layers = utils.num_layers(config)
+
+        if utils.args.cpu:
+            breakmodel.gpu_blocks = [0] * n_layers
+            return
+
+        elif (
+            utils.args.breakmodel_gpulayers is not None
+            or utils.args.breakmodel_disklayers is not None
+        ):
+            try:
+                if not utils.args.breakmodel_gpulayers:
+                    breakmodel.gpu_blocks = []
+                else:
+                    breakmodel.gpu_blocks = list(
+                        map(int, utils.args.breakmodel_gpulayers.split(","))
+                    )
+                assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count()
+                s = n_layers
+                for i in range(len(breakmodel.gpu_blocks)):
+                    if breakmodel.gpu_blocks[i] <= -1:
+                        breakmodel.gpu_blocks[i] = s
+                        break
+                    else:
+                        s -= breakmodel.gpu_blocks[i]
+                assert sum(breakmodel.gpu_blocks) <= n_layers
+                n_layers -= sum(breakmodel.gpu_blocks)
+                if utils.args.breakmodel_disklayers is not None:
+                    assert utils.args.breakmodel_disklayers <= n_layers
+                    breakmodel.disk_blocks = utils.args.breakmodel_disklayers
+                    n_layers -= utils.args.breakmodel_disklayers
+            except:
+                logger.warning(
+                    "--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0."
+                )
+                breakmodel.gpu_blocks = [n_layers]
+                n_layers = 0
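+        # e.g. "--breakmodel_gpulayers 8,-1" on a 24-layer model resolves to
+        # gpu_blocks == [8, 16]: a -1 entry soaks up all remaining layers.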
+        elif utils.args.breakmodel_layers is not None:
+            breakmodel.gpu_blocks = [
+                n_layers - max(0, min(n_layers, utils.args.breakmodel_layers))
+            ]
+            n_layers -= sum(breakmodel.gpu_blocks)
+        elif utils.args.model is not None:
+            logger.info("Breakmodel not specified, assuming GPU 0")
+            breakmodel.gpu_blocks = [n_layers]
+            n_layers = 0
+        else:
+            device_count = torch.cuda.device_count()
+            if device_count > 1:
+                print(
+                    Colors.CYAN
+                    + "\nPlease select one of your GPUs to be your primary GPU."
+                )
+                print(
+                    "VRAM usage in your primary GPU will be higher than for your other ones."
+                )
+                print("It is recommended you make your fastest GPU your primary GPU.")
+                self.breakmodel_device_list(n_layers)
+                while True:
+                    primaryselect = input("device ID> ")
+                    if (
+                        primaryselect.isnumeric()
+                        and 0 <= int(primaryselect) < device_count
+                    ):
+                        breakmodel.primary_device = int(primaryselect)
+                        break
+                    else:
+                        print(
+                            f"{Colors.RED}Please enter an integer between 0 and {device_count-1}.{Colors.END}"
+                        )
+            else:
+                breakmodel.primary_device = 0
+
+            print(
+                Colors.PURPLE
+                + "\nIf you don't have enough VRAM to run the model on a single GPU"
+            )
+            print(
+                "you can split the model between your CPU and your GPU(s), or between"
+            )
+            print("multiple GPUs if you have more than one.")
+            print("By putting more 'layers' on a GPU or CPU, more computations will be")
+            print(
+                "done on that device and more VRAM or RAM will be required on that device"
+            )
+            print("(roughly proportional to number of layers).")
+            print(
+                "It should be noted that GPUs are orders of magnitude faster than the CPU."
+            )
+            print(
+                f"This model has{Colors.YELLOW} {n_layers} {Colors.PURPLE}layers.{Colors.END}\n"
+            )
+
+            for i in range(device_count):
+                self.breakmodel_device_list(
+                    n_layers, primary=breakmodel.primary_device, selected=i
+                )
+                print(
+                    f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
+                )
+                while True:
+                    layerselect = input("# of layers> ")
+                    if (
+                        layerselect.isnumeric() or layerselect.strip() == "-1"
+                    ) and -1 <= int(layerselect) <= n_layers:
+                        layerselect = int(layerselect)
+                        layerselect = n_layers if layerselect == -1 else layerselect
+                        breakmodel.gpu_blocks.append(layerselect)
+                        n_layers -= layerselect
+                        break
+                    else:
+                        print(
+                            f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
+                        )
+                if n_layers == 0:
+                    break
+
+            if n_layers > 0:
+                self.breakmodel_device_list(
+                    n_layers, primary=breakmodel.primary_device, selected=-1
+                )
+                print(
+                    f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
+                )
+                while True:
+                    layerselect = input("# of layers> ")
+                    if (
+                        layerselect.isnumeric() or layerselect.strip() == "-1"
+                    ) and -1 <= int(layerselect) <= n_layers:
+                        layerselect = int(layerselect)
+                        layerselect = n_layers if layerselect == -1 else layerselect
+                        breakmodel.disk_blocks = layerselect
+                        n_layers -= layerselect
+                        break
+                    else:
+                        print(
+                            f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
+                        )
+
+        logger.init_ok("Final device configuration:", status="Info")
+        self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)
+
+        # If all layers are on the same device, use the old GPU generation mode
+        while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0:
+            breakmodel.gpu_blocks.pop()
+        if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (
+            -1,
+            utils.num_layers(config),
+        ):
+            utils.koboldai_vars.breakmodel = False
+            utils.koboldai_vars.usegpu = True
+            utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks) - 1
+            return
+
+        if not breakmodel.gpu_blocks:
+            logger.warning("Nothing assigned to a GPU, reverting to CPU only mode")
+            import breakmodel
+
+            breakmodel.primary_device = "cpu"
+            utils.koboldai_vars.breakmodel = False
+            utils.koboldai_vars.usegpu = False
+            return
diff --git a/modeling/inference_models/horde.py b/modeling/inference_models/horde.py
new file mode 100644
index 00000000..1fea9b56
--- /dev/null
+++ b/modeling/inference_models/horde.py
@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+import time
+import torch
+import requests
+import numpy as np
+from typing import List, Union
+
+import utils
+from logger import logger
+
+from modeling.inference_model import (
+    GenerationResult,
+    GenerationSettings,
+    InferenceModel,
+)
+
+
+class HordeException(Exception):
+    """To be used for errors on server side of the Horde."""
+
+
+class HordeInferenceModel(InferenceModel):
+    def _load(self, save_model: bool, initial_load: bool) -> None:
+        self.tokenizer = self._get_tokenizer(
+            utils.koboldai_vars.cluster_requested_models[0]
+            if len(utils.koboldai_vars.cluster_requested_models) > 0
+            else "gpt2",
+        )
+
+    def _raw_generate(
+        self,
+        prompt_tokens: Union[List[int], torch.Tensor],
+        max_new: int,
+        gen_settings: GenerationSettings,
+        single_line: bool = False,
+        batch_count: int = 1,
+    ) -> GenerationResult:
+        decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens))
+
+        # Store context in memory to use it for comparison with generated content
+        utils.koboldai_vars.lastctx = decoded_prompt
+
+        # Build request JSON data
+        reqdata = {
+            "max_length": max_new,
+            "max_context_length": utils.koboldai_vars.max_length,
+            "rep_pen": gen_settings.rep_pen,
+            "rep_pen_slope": gen_settings.rep_pen_slope,
+            "rep_pen_range": gen_settings.rep_pen_range,
+            "temperature": gen_settings.temp,
+            "top_p": gen_settings.top_p,
+            "top_k": int(gen_settings.top_k),
+            "top_a": gen_settings.top_a,
+            "tfs": gen_settings.tfs,
+            "typical": gen_settings.typical,
+            "n": batch_count,
+        }
+
+        cluster_metadata = {
+            "prompt": decoded_prompt,
+            "params": reqdata,
+            "models": [x for x in utils.koboldai_vars.cluster_requested_models if x],
+            "trusted_workers": False,
+        }
+
+        client_agent = "KoboldAI:2.0.0:koboldai.org"
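+        # Client-Agent follows the Horde's "<name>:<version>:<contact>" form.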
+        cluster_headers = {
+            "apikey": utils.koboldai_vars.horde_api_key,
+            "Client-Agent": client_agent,
+        }
+
+        try:
+            # Create request
+            req = requests.post(
+                utils.koboldai_vars.colaburl[:-8] + "/api/v2/generate/text/async",
+                json=cluster_metadata,
+                headers=cluster_headers,
+            )
+        except requests.exceptions.ConnectionError:
+            errmsg = "Horde unavailable. Please try again later"
+            logger.error(errmsg)
+            raise HordeException(errmsg)
+
+        if req.status_code == 503:
+            errmsg = "KoboldAI API Error: No available KoboldAI servers found in Horde to fulfil this request using the selected models or other properties."
+            logger.error(errmsg)
+            raise HordeException(errmsg)
+        elif not req.ok:
+            errmsg = "KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console."
+            logger.error(errmsg)
+            logger.error(f"HTTP {req.status_code}!!!")
+            logger.error(req.text)
+            raise HordeException(errmsg)
+
+        try:
+            req_status = req.json()
+        except requests.exceptions.JSONDecodeError:
+            errmsg = f"Unexpected message received from the Horde: '{req.text}'"
+            logger.error(errmsg)
+            raise HordeException(errmsg)
+
+        request_id = req_status["id"]
+        logger.debug("Horde Request ID: {}".format(request_id))
+
+        # We've sent the request and got the ID back, now we need to watch it to see when it finishes
+        finished = False
+
+        cluster_agent_headers = {"Client-Agent": client_agent}
+
+        while not finished:
+            try:
+                req = requests.get(
+                    f"{utils.koboldai_vars.colaburl[:-8]}/api/v2/generate/text/status/{request_id}",
+                    headers=cluster_agent_headers,
+                )
+            except requests.exceptions.ConnectionError:
+                errmsg = "Horde unavailable. Please try again later"
+                logger.error(errmsg)
+                raise HordeException(errmsg)
+
+            if not req.ok:
+                errmsg = "KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console."
+                logger.error(req.text)
+                raise HordeException(errmsg)
+
+            try:
+                req_status = req.json()
+            except requests.exceptions.JSONDecodeError:
+                errmsg = (
+                    f"Unexpected message received from the KoboldAI Horde: '{req.text}'"
+                )
+                logger.error(errmsg)
+                raise HordeException(errmsg)
+
+            if "done" not in req_status:
+                errmsg = f"Unexpected response received from the KoboldAI Horde: '{req_status}'"
+                logger.error(errmsg)
+                raise HordeException(errmsg)
+
+            finished = req_status["done"]
+            utils.koboldai_vars.horde_wait_time = req_status["wait_time"]
+            utils.koboldai_vars.horde_queue_position = req_status["queue_position"]
+            utils.koboldai_vars.horde_queue_size = req_status["waiting"]
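+            # Abridged status payload while waiting (illustrative):
+            #   {"done": false, "wait_time": 12, "queue_position": 3,
+            #    "waiting": 1, "faulted": false}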
+
+            if not finished:
+                logger.debug(req_status)
+                time.sleep(1)
+
+        logger.debug("Last Horde Status Message: {}".format(req_status))
+
+        if req_status["faulted"]:
+            raise HordeException("Horde Text generation faulted! Please try again.")
+
+        generations = req_status["generations"]
+        gen_servers = [(cgen["worker_name"], cgen["worker_id"]) for cgen in generations]
+        logger.info(f"Generations by: {gen_servers}")
+
+        return GenerationResult(
+            model=self,
+            out_batches=np.array(
+                [self.tokenizer.encode(cgen["text"]) for cgen in generations]
+            ),
+            prompt=prompt_tokens,
+            is_whole_generation=True,
+            single_line=single_line,
+        )
diff --git a/modeling/inference_models/legacy_gpt2_hf.py b/modeling/inference_models/legacy_gpt2_hf.py
new file mode 100644
index 00000000..40f5bc8c
--- /dev/null
+++ b/modeling/inference_models/legacy_gpt2_hf.py
@@ -0,0 +1,72 @@
+from __future__ import annotations
+
+import os
+import json
+import traceback
+
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+
+import utils
+from modeling.inference_models.hf_torch import HFTorchInferenceModel
+
+
+class CustomGPT2HFTorchInferenceModel(HFTorchInferenceModel):
+    def _load(self, save_model: bool, initial_load: bool) -> None:
+        utils.koboldai_vars.lazy_load = False
+
+        model_path = None
+
+        for possible_config_path in [
+            utils.koboldai_vars.custmodpth,
+            os.path.join("models", utils.koboldai_vars.custmodpth),
+        ]:
+            try:
+                with open(
+                    os.path.join(possible_config_path, "config.json"), "r"
+                ) as file:
+                    self.model_config = json.load(file)
+                model_path = possible_config_path
+                break
+            except FileNotFoundError:
+                pass
+
+        if not model_path:
+            raise RuntimeError("Empty model_path!")
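+        # The config.json is looked for both at custmodpth itself and under
+        # "models/<custmodpth>", mirroring the two supported folder layouts.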
+
+        with self._maybe_use_float16():
+            try:
+                self.model = GPT2LMHeadModel.from_pretrained(
+                    utils.koboldai_vars.custmodpth,
+                    revision=utils.koboldai_vars.revision,
+                    cache_dir="cache",
+                )
+                self.tokenizer = GPT2Tokenizer.from_pretrained(
+                    utils.koboldai_vars.custmodpth,
+                    revision=utils.koboldai_vars.revision,
+                    cache_dir="cache",
+                )
+            except Exception as e:
+                if "out of memory" in traceback.format_exc().lower():
+                    raise RuntimeError(
+                        "One of your GPUs ran out of memory when KoboldAI tried to load your model."
+                    ) from e
+                raise e
+
+        if save_model:
+            self.model.save_pretrained(
+                self.get_local_model_path(ignore_existance=True),
+                max_shard_size="500MiB",
+            )
+            self.tokenizer.save_pretrained(
+                self.get_local_model_path(ignore_existance=True)
+            )
+
+        utils.koboldai_vars.modeldim = self.get_hidden_size()
+
+        # Is CUDA available? If so, use GPU, otherwise fall back to CPU
+        if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu:
+            self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
+        else:
+            self.model = self.model.to("cpu").float()
+
+        self.patch_embedding()
diff --git a/modeling/inference_models/openai.py b/modeling/inference_models/openai.py
new file mode 100644
index 00000000..56f386e3
--- /dev/null
+++ b/modeling/inference_models/openai.py
@@ -0,0 +1,98 @@
+import torch
+import requests
+import numpy as np
+from typing import List, Union
+
+import utils
+from modeling.inference_model import (
+    GenerationResult,
+    GenerationSettings,
+    InferenceModel,
+)
+
+
+class OpenAIAPIError(Exception):
+    def __init__(self, error_type: str, error_message) -> None:
+        super().__init__(f"{error_type}: {error_message}")
+
+
+class OpenAIAPIInferenceModel(InferenceModel):
+    """InferenceModel for interfacing with OpenAI's generation API."""
+
+    def _load(self, save_model: bool, initial_load: bool) -> None:
+        self.tokenizer = self._get_tokenizer("gpt2")
+
+    def _raw_generate(
+        self,
+        prompt_tokens: Union[List[int], torch.Tensor],
+        max_new: int,
+        gen_settings: GenerationSettings,
+        single_line: bool = False,
+        batch_count: int = 1,
+    ) -> GenerationResult:
+        # Taken mainly from oairequest()
+
+        decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens))
+
+        # Store context in memory to use it for comparison with generated content
+        utils.koboldai_vars.lastctx = decoded_prompt
+
+        # Build request JSON data
+        # GooseAI is a subtype of OAI. So to check if it's this type, we check
+        # the configname as a workaround, as koboldai_vars.model will always be OAI
+        if "GooseAI" in utils.koboldai_vars.configname:
+            reqdata = {
+                "prompt": decoded_prompt,
+                "max_tokens": max_new,
+                "temperature": gen_settings.temp,
+                "top_a": gen_settings.top_a,
+                "top_p": gen_settings.top_p,
+                "top_k": gen_settings.top_k,
+                "tfs": gen_settings.tfs,
+                "typical_p": gen_settings.typical,
+                "repetition_penalty": gen_settings.rep_pen,
+                "repetition_penalty_slope": gen_settings.rep_pen_slope,
+                "repetition_penalty_range": gen_settings.rep_pen_range,
+                "n": batch_count,
+                # TODO: Implement streaming
+                "stream": False,
+            }
+        else:
+            reqdata = {
+                "prompt": decoded_prompt,
+                "max_tokens": max_new,
+                "temperature": gen_settings.temp,
+                "top_p": gen_settings.top_p,
+                "frequency_penalty": gen_settings.rep_pen,
+                "n": batch_count,
+                "stream": False,
+            }
+
+        req = requests.post(
+            utils.koboldai_vars.oaiurl,
+            json=reqdata,
+            headers={
+                "Authorization": "Bearer " + utils.koboldai_vars.oaiapikey,
+                "Content-Type": "application/json",
+            },
+        )
+
+        j = req.json()
+
+        if not req.ok:
+            # Send error message to web client
+            if "error" in j:
+                error_type = j["error"]["type"]
+                error_message = j["error"]["message"]
+            else:
+                error_type = "Unknown"
+                error_message = "Unknown"
+            raise OpenAIAPIError(error_type, error_message)
+
+        outputs = [out["text"] for out in j["choices"]]
+        return GenerationResult(
+            model=self,
+            out_batches=np.array([self.tokenizer.encode(x) for x in outputs]),
+            prompt=prompt_tokens,
+            is_whole_generation=True,
+            single_line=single_line,
+        )
diff --git a/modeling/patches.py b/modeling/patches.py
new file mode 100644
index 00000000..f6b1ff0a
--- /dev/null
+++ b/modeling/patches.py
@@ -0,0 +1,133 @@
+from __future__ import annotations
+
+import copy
+import requests
+from typing import Iterable, List
+from tqdm.auto import tqdm
+
+import transformers
+from transformers import (
+    PreTrainedModel,
+    modeling_utils,
+)
+
+import utils
+
+
+def patch_transformers_download():
+    def http_get(
+        url: str,
+        temp_file,
+        proxies=None,
+        resume_size=0,
+        headers=None,
+        file_name=None,
+    ):
+        """
+        Download remote file. Do not gobble up errors.
+        """
+        headers = copy.deepcopy(headers)
+        if resume_size > 0:
+            headers["Range"] = f"bytes={resume_size}-"
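+        # e.g. resuming at byte 1000 sends the header "Range: bytes=1000-".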
+        r = requests.get(url, stream=True, proxies=proxies, headers=headers)
+        transformers.utils.hub._raise_for_status(r)
+        content_length = r.headers.get("Content-Length")
+        total = (
+            resume_size + int(content_length) if content_length is not None else None
+        )
+
+        # `tqdm` behavior is determined by `utils.logging.is_progress_bar_enabled()`
+        # and can be set using `utils.logging.enable/disable_progress_bar()`
+        if url[-11:] != "config.json":
+            progress = tqdm(
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                total=total,
+                initial=resume_size,
+                desc=f"Downloading {file_name}"
+                if file_name is not None
+                else "Downloading",
+                file=utils.UIProgressBarFile(),
+            )
+            utils.koboldai_vars.status_message = "Download Model"
+            utils.koboldai_vars.total_download_chunks = total
+
+        for chunk in r.iter_content(chunk_size=1024):
+            if chunk:  # filter out keep-alive new chunks
+                if url[-11:] != "config.json":
+                    progress.update(len(chunk))
+                    utils.koboldai_vars.downloaded_chunks += len(chunk)
+                temp_file.write(chunk)
+
+        if url[-11:] != "config.json":
+            progress.close()
+
+        utils.koboldai_vars.status_message = ""
+
+    transformers.utils.hub.http_get = http_get
+
+
+def patch_transformers_loader() -> None:
+    """
+    Patch the Transformers loader to use aria2 and our shard tracking.
+    Universal for TPU/MTJ and Torch.
+    """
+    old_from_pretrained = PreTrainedModel.from_pretrained.__func__
+
+    @classmethod
+    def new_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        utils.koboldai_vars.fp32_model = False
+        utils.num_shards = None
+        utils.current_shard = 0
+        utils.from_pretrained_model_name = pretrained_model_name_or_path
+        utils.from_pretrained_index_filename = None
+        utils.from_pretrained_kwargs = kwargs
+        utils.bar = None
+        if not utils.args.no_aria2:
+            utils.aria2_hook(pretrained_model_name_or_path, **kwargs)
+        return old_from_pretrained(
+            cls, pretrained_model_name_or_path, *model_args, **kwargs
+        )
+
+    if not hasattr(PreTrainedModel, "_kai_patched"):
+        PreTrainedModel.from_pretrained = new_from_pretrained
+        PreTrainedModel._kai_patched = True
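+    # The _kai_patched guard keeps this idempotent if patch_transformers()
+    # runs more than once in a session.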
+
+    if hasattr(modeling_utils, "get_checkpoint_shard_files"):
+        old_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files
+
+        def new_get_checkpoint_shard_files(
+            pretrained_model_name_or_path, index_filename, *args, **kwargs
+        ):
+            utils.num_shards = utils.get_num_shards(index_filename)
+            utils.from_pretrained_index_filename = index_filename
+            return old_get_checkpoint_shard_files(
+                pretrained_model_name_or_path, index_filename, *args, **kwargs
+            )
+
+        modeling_utils.get_checkpoint_shard_files = new_get_checkpoint_shard_files
+
+
+def patch_transformers_generation() -> None:
+    # Not sure why this global is needed...
+    global transformers
+
+    # Allow bad words filter to ban <|endoftext|> token
+    import transformers.generation.logits_process
+
+    def new_init(self, bad_words_ids: List[List[int]], eos_token_id: int):
+        return new_init.old_init(self, bad_words_ids, -1)
+
+    new_init.old_init = (
+        transformers.generation.logits_process.NoBadWordsLogitsProcessor.__init__
+    )
+    transformers.generation.logits_process.NoBadWordsLogitsProcessor.__init__ = new_init
+
+
+def patch_transformers() -> None:
+    patch_transformers_download()
+    patch_transformers_loader()
+
+    # Doesn't do anything for TPU
+    patch_transformers_generation()
diff --git a/modeling/post_token_hooks.py b/modeling/post_token_hooks.py
new file mode 100644
index 00000000..750ce137
--- /dev/null
+++ b/modeling/post_token_hooks.py
@@ -0,0 +1,27 @@
+import torch
+
+import utils
+from modeling.inference_model import InferenceModel
+
+
+class PostTokenHooks:
+    @staticmethod
+    def stream_tokens(
+        model: InferenceModel,
+        input_ids: torch.LongTensor,
+    ) -> None:
+        if not model.gen_state["do_streaming"]:
+            return
+
+        if not utils.koboldai_vars.output_streaming:
+            return
+
+        data = [
+            utils.applyoutputformatting(
+                utils.decodenewlines(model.tokenizer.decode(x[-1])),
+                no_sentence_trimming=True,
+                no_single_line=True,
+            )
+            for x in input_ids
+        ]
+        utils.koboldai_vars.actions.stream_tokens(data)
diff --git a/modeling/stoppers.py b/modeling/stoppers.py
new file mode 100644
index 00000000..2cb8af49
--- /dev/null
+++ b/modeling/stoppers.py
@@ -0,0 +1,117 @@
+from __future__ import annotations
+
+import torch
+
+import utils
+from modeling.inference_model import (
+    InferenceModel,
+)
+
+
+class Stoppers:
+    @staticmethod
+    def core_stopper(
+        model: InferenceModel,
+        input_ids: torch.LongTensor,
+    ) -> bool:
+        if not utils.koboldai_vars.inference_config.do_core:
+            return False
+
+        utils.koboldai_vars.generated_tkns += 1
+
+        if (
+            not utils.koboldai_vars.standalone
+            and utils.koboldai_vars.lua_koboldbridge.generated_cols
+            and utils.koboldai_vars.generated_tkns
+            != utils.koboldai_vars.lua_koboldbridge.generated_cols
+        ):
+            raise RuntimeError(
+                f"Inconsistency detected between KoboldAI Python and Lua backends ({utils.koboldai_vars.generated_tkns} != {utils.koboldai_vars.lua_koboldbridge.generated_cols})"
+            )
+
+        if utils.koboldai_vars.abort or (
+            utils.koboldai_vars.inference_config.stop_at_genamt
+            and utils.koboldai_vars.generated_tkns >= utils.koboldai_vars.genamt
+        ):
+            utils.koboldai_vars.abort = False
+            model.gen_state["regeneration_required"] = False
+            model.gen_state["halt"] = False
+            return True
+
+        if utils.koboldai_vars.standalone:
+            return False
+
+        assert input_ids.ndim == 2
+
+        model.gen_state[
+            "regeneration_required"
+        ] = utils.koboldai_vars.lua_koboldbridge.regeneration_required
+        model.gen_state["halt"] = not utils.koboldai_vars.lua_koboldbridge.generating
+        utils.koboldai_vars.lua_koboldbridge.regeneration_required = False
+
+        for i in (
+            range(utils.koboldai_vars.numseqs)
+            if not utils.koboldai_vars.alt_multi_gen
+            else range(1)
+        ):
+            utils.koboldai_vars.lua_koboldbridge.generated[i + 1][
+                utils.koboldai_vars.generated_tkns
+            ] = int(input_ids[i, -1].item())
+
+        return model.gen_state["regeneration_required"] or model.gen_state["halt"]
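+        # (Lua tables are 1-indexed, hence generated[i + 1] keyed by the
+        # already-incremented generated_tkns count above.)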
+
+    @staticmethod
+    def dynamic_wi_scanner(
+        model: InferenceModel,
+        input_ids: torch.LongTensor,
+    ) -> bool:
+        if not utils.koboldai_vars.inference_config.do_dynamic_wi:
+            return False
+
+        if not utils.koboldai_vars.dynamicscan:
+            return False
+
+        if len(model.gen_state["wi_scanner_excluded_keys"]) != input_ids.shape[0]:
+            print(model.tokenizer.decode(model.gen_state["wi_scanner_excluded_keys"]))
+            print(model.tokenizer.decode(input_ids.shape[0]))
+
+        assert len(model.gen_state["wi_scanner_excluded_keys"]) == input_ids.shape[0]
+
+        tail = input_ids[..., -utils.koboldai_vars.generated_tkns :]
+        for i, t in enumerate(tail):
+            decoded = utils.decodenewlines(model.tokenizer.decode(t))
+            _, _, _, found = utils.koboldai_vars.calc_ai_text(
+                submitted_text=decoded, send_context=False
+            )
+            found = list(
+                set(found) - set(model.gen_state["wi_scanner_excluded_keys"][i])
+            )
+            if found:
+                print("FOUNDWI", found)
+                return True
+        return False
+
+    @staticmethod
+    def chat_mode_stopper(
+        model: InferenceModel,
+        input_ids: torch.LongTensor,
+    ) -> bool:
+        if not utils.koboldai_vars.chatmode:
+            return False
+
+        data = [model.tokenizer.decode(x) for x in input_ids]
+        # null_character = model.tokenizer.encode(chr(0))[0]
+        if "completed" not in model.gen_state:
+            model.gen_state["completed"] = [False] * len(input_ids)
+
+        for i in range(len(input_ids)):
+            if (
+                data[i][-1 * (len(utils.koboldai_vars.chatname) + 1) :]
+                == utils.koboldai_vars.chatname + ":"
+            ):
+                model.gen_state["completed"][i] = True
+        if all(model.gen_state["completed"]):
+            utils.koboldai_vars.generated_tkns = utils.koboldai_vars.genamt
+            del model.gen_state["completed"]
+            return True
+        return False
diff --git a/warpers.py b/modeling/warpers.py
similarity index 100%
rename from warpers.py
rename to modeling/warpers.py
diff --git a/tpu_mtj_backend.py b/tpu_mtj_backend.py
index 5afeff50..c01210cd 100644
--- a/tpu_mtj_backend.py
+++ b/tpu_mtj_backend.py
@@ -54,7 +54,7 @@ from mesh_transformer.transformer_shard import CausalTransformer, CausalTransfor
 from mesh_transformer.util import to_bf16
 import time
 
-import warpers
+import modeling.warpers as warpers
 
 socketio = None