From 6b4905de30e430c7f80b368b50f22a668b376d94 Mon Sep 17 00:00:00 2001 From: somebody Date: Sat, 25 Feb 2023 16:05:56 -0600 Subject: [PATCH] Model: Port rest of models over Generation's still broke but it's a start --- aiserver.py | 1586 +------------------------------------- model.py | 2129 ++++++++++++++++++++++++++++++++++++++++++++++----- utils.py | 32 +- 3 files changed, 1987 insertions(+), 1760 deletions(-) diff --git a/aiserver.py b/aiserver.py index d872c2fc..49b97e06 100644 --- a/aiserver.py +++ b/aiserver.py @@ -81,7 +81,6 @@ except: from transformers import GenerationMixin -from model import GenericHFTorchInferenceModel, CustomGPT2HFTorchInferenceModel # Text2img import base64 from PIL import Image @@ -117,33 +116,6 @@ def new_pretrainedtokenizerbase_from_pretrained(cls, *args, **kwargs): return tokenizer PreTrainedTokenizerBase.from_pretrained = new_pretrainedtokenizerbase_from_pretrained -# We only want to use logit manipulations and such on our core text model -class use_core_manipulations: - # These must be set by wherever they get setup - get_logits_processor: callable - sample: callable - get_stopping_criteria: callable - - # We set these automatically - old_get_logits_processor: callable - old_sample: callable - old_get_stopping_criteria: callable - - def __enter__(self): - use_core_manipulations.old_get_logits_processor = transformers.GenerationMixin._get_logits_processor - transformers.GenerationMixin._get_logits_processor = use_core_manipulations.get_logits_processor - - use_core_manipulations.old_sample = transformers.GenerationMixin.sample - transformers.GenerationMixin.sample = use_core_manipulations.sample - - use_core_manipulations.old_get_stopping_criteria = transformers.GenerationMixin._get_stopping_criteria - transformers.GenerationMixin._get_stopping_criteria = use_core_manipulations.get_stopping_criteria - return self - - def __exit__(self, exc_type, exc_value, exc_traceback): - transformers.GenerationMixin._get_logits_processor = use_core_manipulations.old_get_logits_processor - transformers.GenerationMixin.sample = use_core_manipulations.old_sample - transformers.GenerationMixin._get_stopping_criteria = use_core_manipulations.old_get_stopping_criteria #==================================================================# # Variables & Storage @@ -179,7 +151,7 @@ model_menu = { ["Untuned Fairseq Dense", "fsdlist", "", True], ["Untuned Bloom", "bloomlist", "", True], ["Untuned XGLM", "xglmlist", "", True], - ["Untuned RWKV-4 (Experimental)", "rwkvlist", "", True], + # ["Untuned RWKV-4 (Experimental)", "rwkvlist", "", True], ["Untuned GPT2", "gpt2list", "", True], ["Online Services", "apilist", "", True], ["Read Only (No AI)", "ReadOnly", "", False] @@ -305,19 +277,19 @@ model_menu = { ["XGLM 564M", "facebook/xglm-564M", "4GB", False], ["Return to Main Menu", "mainmenu", "", True], ], - 'rwkvlist': [ - ["RWKV-4 7B (GPU)", "RWKV-7B-GPU", "??GB", False], - ["RWKV-4 7B (CPU)", "RWKV-7B-CPU", "??GB", False], - ["RWKV-4 3B (GPU)", "RWKV-3B-GPU", "?GB", False], - ["RWKV-4 3B (CPU)", "RWKV-3B-CPU", "?GB", False], - ["RWKV-4 1.5B (GPU)", "RWKV-1B5-GPU", "9GB", False], - ["RWKV-4 1.5B (CPU)", "RWKV-1B5-CPU", "6GB", False], - ["RWKV-4 340M (GPU)", "RWKV-340M-GPU", "?GB", False], - ["RWKV-4 340M (CPU)", "RWKV-340M-CPU", "?GB", False], - ["RWKV-4 169M (GPU)", "RWKV-169M-GPU", "?GB", False], - ["RWKV-4 169M (CPU)", "RWKV-169M-CPU", "?GB", False], - ["Return to Main Menu", "mainmenu", "", True], - ], + # 'rwkvlist': [ + # ["RWKV-4 7B (GPU)", "RWKV-7B-GPU", "??GB", False], + # 
["RWKV-4 7B (CPU)", "RWKV-7B-CPU", "??GB", False], + # ["RWKV-4 3B (GPU)", "RWKV-3B-GPU", "?GB", False], + # ["RWKV-4 3B (CPU)", "RWKV-3B-CPU", "?GB", False], + # ["RWKV-4 1.5B (GPU)", "RWKV-1B5-GPU", "9GB", False], + # ["RWKV-4 1.5B (CPU)", "RWKV-1B5-CPU", "6GB", False], + # ["RWKV-4 340M (GPU)", "RWKV-340M-GPU", "?GB", False], + # ["RWKV-4 340M (CPU)", "RWKV-340M-CPU", "?GB", False], + # ["RWKV-4 169M (GPU)", "RWKV-169M-GPU", "?GB", False], + # ["RWKV-4 169M (CPU)", "RWKV-169M-CPU", "?GB", False], + # ["Return to Main Menu", "mainmenu", "", True], + # ], 'apilist': [ ["GooseAI API (requires API key)", "GooseAI", "None", False], ["OpenAI API (requires API key)", "OAI", "None", False], @@ -562,10 +534,12 @@ logger.add(UI_2_log_history, serialize=True, colorize=True, enqueue=True, level= #logger.add("log_file_1.log", rotation="500 MB") # Automatically rotate too big file koboldai_vars = koboldai_settings.koboldai_vars(socketio) - utils.koboldai_vars = koboldai_vars utils.socketio = socketio +# HACK: Weird import position to steal koboldai_vars from utils +from model import GenericHFTorchInferenceModel, CustomGPT2HFTorchInferenceModel, HFMTJInferenceModel, patch_transformers + old_socketio_on = socketio.on def new_socketio_on(*a, **k): decorator = old_socketio_on(*a, **k) @@ -1417,26 +1391,6 @@ def general_startup(override_args=None): # Load Model #==================================================================# -def tpumtjgetsofttokens(): - soft_tokens = None - if(koboldai_vars.sp is None): - tensor = np.zeros((1, tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"])), dtype=np.float32) - rows = tensor.shape[0] - padding_amount = tpu_mtj_backend.params["seq"] - (tpu_mtj_backend.params["seq"] % -tpu_mtj_backend.params["cores_per_replica"]) - rows - tensor = np.pad(tensor, ((0, padding_amount), (0, 0))) - tensor = tensor.reshape( - tpu_mtj_backend.params["cores_per_replica"], - -1, - tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"]), - ) - koboldai_vars.sp = tpu_mtj_backend.shard_xmap(tensor) - soft_tokens = np.arange( - tpu_mtj_backend.params["n_vocab"] + tpu_mtj_backend.params["n_vocab_padding"], - tpu_mtj_backend.params["n_vocab"] + tpu_mtj_backend.params["n_vocab_padding"] + koboldai_vars.sp_length, - dtype=np.uint32 - ) - return soft_tokens - @socketio.on("get_model_info") def get_model_info(model, directory=""): logger.info("Selected: {}, {}".format(model, directory)) @@ -1740,649 +1694,6 @@ def get_cluster_models(msg): emit('from_server', {'cmd': 'oai_engines', 'data': engines, 'online_model': online_model}, broadcast=True) -def patch_transformers_download(): - global transformers - import copy, requests, tqdm, time - class Send_to_socketio(object): - def write(self, bar): - bar = bar.replace("\r", "").replace("\n", "") - - if bar != "" and [ord(num) for num in bar] != [27, 91, 65]: #No idea why we're getting the 27, 1, 65 character set, just killing to so we can move on - try: - print('\r' + bar, end='') - socketio.emit('from_server', {'cmd': 'model_load_status', 'data': bar.replace(" ", " ")}, broadcast=True, room="UI_1") - eventlet.sleep(seconds=0) - except: - pass - def flush(self): - pass - - def http_get( - url: str, - temp_file, - proxies=None, - resume_size=0, - headers=None, - file_name=None, - ): - """ - Download remote file. Do not gobble up errors. 
- """ - headers = copy.deepcopy(headers) - if resume_size > 0: - headers["Range"] = f"bytes={resume_size}-" - r = requests.get(url, stream=True, proxies=proxies, headers=headers) - transformers.utils.hub._raise_for_status(r) - content_length = r.headers.get("Content-Length") - total = resume_size + int(content_length) if content_length is not None else None - # `tqdm` behavior is determined by `utils.logging.is_progress_bar_enabled()` - # and can be set using `utils.logging.enable/disable_progress_bar()` - if url[-11:] != 'config.json': - progress = tqdm.tqdm( - unit="B", - unit_scale=True, - unit_divisor=1024, - total=total, - initial=resume_size, - desc=f"Downloading {file_name}" if file_name is not None else "Downloading", - file=Send_to_socketio(), - ) - koboldai_vars.status_message = "Download Model" - koboldai_vars.total_download_chunks = total - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - if url[-11:] != 'config.json': - progress.update(len(chunk)) - koboldai_vars.downloaded_chunks += len(chunk) - temp_file.write(chunk) - if url[-11:] != 'config.json': - progress.close() - - koboldai_vars.status_message = "" - - # def http_get( - # url: str, - # temp_file: BinaryIO, - # *, - # proxies=None, - # resume_size=0, - # headers: Optional[Dict[str, str]] = None, - # timeout=10.0, - # max_retries=0, - # ): - # """ - # Donwload a remote file. Do not gobble up errors, and will return errors tailored to the Hugging Face Hub. - # """ - # headers = copy.deepcopy(headers) - # if resume_size > 0: - # headers["Range"] = "bytes=%d-" % (resume_size,) - # r = _request_wrapper( - # method="GET", - # url=url, - # stream=True, - # proxies=proxies, - # headers=headers, - # timeout=timeout, - # max_retries=max_retries, - # ) - # hf_raise_for_status(r) - # content_length = r.headers.get("Content-Length") - # total = resume_size + int(content_length) if content_length is not None else None - # progress = tqdm( - # unit="B", - # unit_scale=True, - # total=total, - # initial=resume_size, - # desc="Downloading", - # file=Send_to_socketio(), - # disable=bool(logger.getEffectiveLevel() == logging.NOTSET), - # ) - # for chunk in r.iter_content(chunk_size=1024): - # if chunk: # filter out keep-alive new chunks - # progress.update(len(chunk)) - # temp_file.write(chunk) - # progress.close() - - transformers.utils.hub.http_get = http_get - - -def patch_transformers(): - global transformers - - patch_transformers_download() - - old_from_pretrained = PreTrainedModel.from_pretrained.__func__ - @classmethod - def new_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - koboldai_vars.fp32_model = False - utils.num_shards = None - utils.current_shard = 0 - utils.from_pretrained_model_name = pretrained_model_name_or_path - utils.from_pretrained_index_filename = None - utils.from_pretrained_kwargs = kwargs - utils.bar = None - if not args.no_aria2: - utils.aria2_hook(pretrained_model_name_or_path, **kwargs) - return old_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs) - if(not hasattr(PreTrainedModel, "_kai_patched")): - PreTrainedModel.from_pretrained = new_from_pretrained - PreTrainedModel._kai_patched = True - if(hasattr(modeling_utils, "get_checkpoint_shard_files")): - old_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files - def new_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs): - utils.num_shards = utils.get_num_shards(index_filename) - 
utils.from_pretrained_index_filename = index_filename - return old_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs) - modeling_utils.get_checkpoint_shard_files = new_get_checkpoint_shard_files - - # Some versions of transformers 4.17.0.dev0 are affected by - # https://github.com/huggingface/transformers/issues/15736 - # This is a workaround for those versions of transformers. - if(transformers_version == "4.17.0.dev0"): - try: - from transformers.models.xglm.modeling_xglm import XGLMSinusoidalPositionalEmbedding - except ImportError: - pass - else: - @torch.no_grad() - def new_forward(self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0): - bsz, seq_len = inputs_embeds.size()[:-1] - input_shape = inputs_embeds.size()[:-1] - sequence_length = input_shape[1] - position_ids = torch.arange( - past_key_values_length + self.padding_idx + 1, past_key_values_length + sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device - ).unsqueeze(0).expand(input_shape).contiguous() - max_pos = self.padding_idx + 1 + seq_len + past_key_values_length - if max_pos > self.weights.size(0): - self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx) - return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach() - XGLMSinusoidalPositionalEmbedding.forward = new_forward - - - # Fix a bug in OPTForCausalLM where self.lm_head is the wrong size - if(packaging.version.parse("4.19.0.dev0") <= packaging.version.parse(transformers_version) < packaging.version.parse("4.20.0")): - try: - from transformers import OPTForCausalLM, OPTModel - except ImportError: - pass - else: - # This is the same as the original __init__ but with - # config.hidden_size - # replaced with - # config.word_embed_proj_dim - def new_init(self, config): - super(OPTForCausalLM, self).__init__(config) - self.model = OPTModel(config) - self.lm_head = torch.nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False) - self.post_init() - OPTForCausalLM.__init__ = new_init - - - # Patch transformers to use our custom logit warpers - from transformers import LogitsProcessorList, LogitsWarper, LogitsProcessor, TopKLogitsWarper, TopPLogitsWarper, TemperatureLogitsWarper, RepetitionPenaltyLogitsProcessor - from warpers import AdvancedRepetitionPenaltyLogitsProcessor, TailFreeLogitsWarper, TypicalLogitsWarper, TopALogitsWarper - - def dynamic_processor_wrap(cls, field_name, var_name, cond=None): - old_call = cls.__call__ - def new_call(self, *args, **kwargs): - if(not isinstance(field_name, str) and isinstance(field_name, Iterable)): - conds = [] - for f, v in zip(field_name, var_name): - conds.append(getattr(koboldai_vars, v)) - setattr(self, f, conds[-1]) - else: - conds = getattr(koboldai_vars, var_name) - setattr(self, field_name, conds) - assert len(args) == 2 - if(cond is None or cond(conds)): - return old_call(self, *args, **kwargs) - return args[1] - cls.__call__ = new_call - dynamic_processor_wrap(AdvancedRepetitionPenaltyLogitsProcessor, ("penalty", "penalty_slope", "penalty_range", "use_alt_rep_pen"), ("rep_pen", "rep_pen_slope", "rep_pen_range", "use_alt_rep_pen"), cond=lambda x: x[0] != 1.0) - dynamic_processor_wrap(TopKLogitsWarper, "top_k", "top_k", cond=lambda x: x > 0) - dynamic_processor_wrap(TopALogitsWarper, "top_a", "top_a", cond=lambda x: x > 0.0) - dynamic_processor_wrap(TopPLogitsWarper, "top_p", "top_p", cond=lambda x: x < 1.0) - 
dynamic_processor_wrap(TailFreeLogitsWarper, "tfs", "tfs", cond=lambda x: x < 1.0) - dynamic_processor_wrap(TypicalLogitsWarper, "typical", "typical", cond=lambda x: x < 1.0) - dynamic_processor_wrap(TemperatureLogitsWarper, "temperature", "temp", cond=lambda x: x != 1.0) - - class PhraseBiasLogitsProcessor(LogitsProcessor): - def __init__(self): - pass - - def _find_intersection(self, big: List, small: List) -> int: - """Find the maximum overlap between the beginning of small and the end of big. - Return the index of the token in small following the overlap, or 0. - - big: The tokens in the context (as a tensor) - small: The tokens in the phrase to bias (as a list) - - Both big and small are in "oldest to newest" order. - """ - # There are asymptotically more efficient methods for determining the overlap, - # but typically there will be few (0-1) instances of small[0] in the last len(small) - # elements of big, plus small will typically be fairly short. So this naive - # approach is acceptable despite O(N^2) worst case performance. - - num_small = len(small) - # The small list can only ever match against at most num_small tokens of big, - # so create a slice. Typically, this slice will be as long as small, but it - # may be shorter if the story has just started. - # We need to convert the big slice to list, since natively big is a tensor - # and tensor and list don't ever compare equal. It's better to convert here - # and then use native equality tests than to iterate repeatedly later. - big_slice = list(big[-num_small:]) - - # It's possible that the start token appears multiple times in small - # For example, consider the phrase: - # [ fair is foul, and foul is fair, hover through the fog and filthy air] - # If we merely look for the first instance of [ fair], then we would - # generate the following output: - # " fair is foul, and foul is fair is foul, and foul is fair..." - start = small[0] - for i, t in enumerate(big_slice): - # Strictly unnecessary, but it's marginally faster to test the first - # token before creating slices to test for a full match. - if t == start: - remaining = len(big_slice) - i - if big_slice[i:] == small[:remaining]: - # We found a match. If the small phrase has any remaining tokens - # then return the index of the next token. - if remaining < num_small: - return remaining - # In this case, the entire small phrase matched, so start over. - return 0 - - # There were no matches, so just begin at the beginning. - return 0 - - def _allow_leftwards_tampering(self, phrase: str) -> bool: - """Determines if a phrase should be tampered with from the left in - the "soft" token encoding mode.""" - - if phrase[0] in [".", "?", "!", ";", ":", "\n"]: - return False - return True - - def _get_token_sequence(self, phrase: str) -> List[List]: - """Convert the phrase string into a list of encoded biases, each - one being a list of tokens. How this is done is determined by the - phrase's format: - - - If the phrase is surrounded by square brackets ([]), the tokens - will be the phrase split by commas (,). If a "token" isn't - actually a number, it will be skipped. NOTE: Tokens output by - this may not be in the model's vocabulary, and such tokens - should be ignored later in the pipeline. - - If the phrase is surrounded by curly brackets ({}), the phrase - will be directly encoded with no synonym biases and no fancy - tricks. - - Otherwise, the phrase will be encoded, with close deviations - being included as synonym biases. 
- """ - - # TODO: Cache these tokens, invalidate when model or bias is - # changed. - - # Handle direct token id input - if phrase.startswith("[") and phrase.endswith("]"): - no_brackets = phrase[1:-1] - ret = [] - for token_id in no_brackets.split(","): - try: - ret.append(int(token_id)) - except ValueError: - # Ignore non-numbers. Rascals! - pass - return [ret] - - # Handle direct phrases - if phrase.startswith("{") and phrase.endswith("}"): - no_brackets = phrase[1:-1] - return [tokenizer.encode(no_brackets)] - - # Handle untamperable phrases - if not self._allow_leftwards_tampering(phrase): - return [tokenizer.encode(phrase)] - - # Handle slight alterations to original phrase - phrase = phrase.strip(" ") - ret = [] - - for alt_phrase in [phrase, f" {phrase}"]: - ret.append(tokenizer.encode(alt_phrase)) - - return ret - - def _get_biased_tokens(self, input_ids: List) -> Dict: - # TODO: Different "bias slopes"? - - ret = {} - for phrase, _bias in koboldai_vars.biases.items(): - bias_score, completion_threshold = _bias - token_seqs = self._get_token_sequence(phrase) - variant_deltas = {} - for token_seq in token_seqs: - bias_index = self._find_intersection(input_ids, token_seq) - - # Ensure completion after completion_threshold tokens - # Only provide a positive bias when the base bias score is positive. - if bias_score > 0 and bias_index + 1 > completion_threshold: - bias_score = 999 - - token_to_bias = token_seq[bias_index] - variant_deltas[token_to_bias] = bias_score - - # If multiple phrases bias the same token, add the modifiers - # together. This should NOT be applied to automatic variants - for token_to_bias, bias_score in variant_deltas.items(): - if token_to_bias in ret: - ret[token_to_bias] += bias_score - else: - ret[token_to_bias] = bias_score - return ret - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - assert scores.ndim == 2 - assert input_ids.ndim == 2 - - scores_shape = scores.shape - - for batch in range(scores_shape[0]): - for token, bias in self._get_biased_tokens(input_ids[batch]).items(): - scores[batch][token] += bias - - return scores - - - class LuaLogitsProcessor(LogitsProcessor): - - def __init__(self): - pass - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - assert scores.ndim == 2 - assert input_ids.ndim == 2 - self.regeneration_required = False - self.halt = False - - if(koboldai_vars.standalone): - return scores - - scores_shape = scores.shape - scores_list = scores.tolist() - koboldai_vars.lua_koboldbridge.logits = koboldai_vars.lua_state.table() - for r, row in enumerate(scores_list): - koboldai_vars.lua_koboldbridge.logits[r+1] = koboldai_vars.lua_state.table(*row) - koboldai_vars.lua_koboldbridge.vocab_size = scores_shape[-1] - - execute_genmod() - - scores = torch.tensor( - tuple(tuple(row.values()) for row in koboldai_vars.lua_koboldbridge.logits.values()), - device=scores.device, - dtype=scores.dtype, - ) - assert scores.shape == scores_shape - - return scores - - from torch.nn import functional as F - - def visualize_probabilities(scores: torch.FloatTensor) -> None: - assert scores.ndim == 2 - - if koboldai_vars.numseqs > 1 or not koboldai_vars.show_probs: - return - - if not koboldai_vars.show_probs: - return scores - - option_offset = 0 - if koboldai_vars.actions.action_count+1 in koboldai_vars.actions.actions: - for x in range(len(koboldai_vars.actions.actions[koboldai_vars.actions.action_count+1]['Options'])): - option = 
koboldai_vars.actions.actions[koboldai_vars.actions.action_count+1]['Options'][x] - if option['Pinned'] or option["Previous Selection"] or option["Edited"]: - option_offset = x+1 - batch_offset = int((koboldai_vars.generated_tkns-1) / koboldai_vars.genamt) if koboldai_vars.alt_multi_gen else 0 - for batch_index, batch in enumerate(scores): - probs = F.softmax(batch, dim = -1).cpu().numpy() - - token_prob_info = [] - for token_id, score in sorted(enumerate(probs), key=lambda x: x[1], reverse=True)[:8]: - token_prob_info.append({ - "tokenId": token_id, - "decoded": utils.decodenewlines(tokenizer.decode(token_id)), - "score": float(score), - }) - - - if koboldai_vars.numseqs == 1: - koboldai_vars.actions.set_probabilities(token_prob_info) - else: - koboldai_vars.actions.set_option_probabilities(token_prob_info, batch_index+option_offset+batch_offset) - - return scores - - def new_get_logits_processor(*args, **kwargs) -> LogitsProcessorList: - processors = new_get_logits_processor.old_get_logits_processor(*args, **kwargs) - processors.insert(0, LuaLogitsProcessor()) - processors.append(PhraseBiasLogitsProcessor()) - return processors - use_core_manipulations.get_logits_processor = new_get_logits_processor - new_get_logits_processor.old_get_logits_processor = transformers.GenerationMixin._get_logits_processor - - class KoboldLogitsWarperList(LogitsProcessorList): - def __init__(self, beams: int = 1, **kwargs): - self.__warper_list: List[LogitsWarper] = [] - self.__warper_list.append(TopKLogitsWarper(top_k=1, min_tokens_to_keep=1 + (beams > 1))) - self.__warper_list.append(TopALogitsWarper(top_a=0.5, min_tokens_to_keep=1 + (beams > 1))) - self.__warper_list.append(TopPLogitsWarper(top_p=0.5, min_tokens_to_keep=1 + (beams > 1))) - self.__warper_list.append(TailFreeLogitsWarper(tfs=0.5, min_tokens_to_keep=1 + (beams > 1))) - self.__warper_list.append(TypicalLogitsWarper(typical=0.5, min_tokens_to_keep=1 + (beams > 1))) - self.__warper_list.append(TemperatureLogitsWarper(temperature=0.5)) - self.__warper_list.append(AdvancedRepetitionPenaltyLogitsProcessor()) - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, *args, **kwargs): - sampler_order = koboldai_vars.sampler_order[:] - if len(sampler_order) < 7: # Add repetition penalty at beginning if it's not present - sampler_order = [6] + sampler_order - for k in sampler_order: - scores = self.__warper_list[k](input_ids, scores, *args, **kwargs) - visualize_probabilities(scores) - return scores - - def new_get_logits_warper(beams: int = 1,) -> LogitsProcessorList: - return KoboldLogitsWarperList(beams=beams) - - def new_sample(self, *args, **kwargs): - assert kwargs.pop("logits_warper", None) is not None - kwargs["logits_warper"] = new_get_logits_warper( - beams=1, - ) - if(koboldai_vars.newlinemode == "s") or (koboldai_vars.newlinemode == "ns"): - kwargs["eos_token_id"] = -1 - kwargs.setdefault("pad_token_id", 2) - return new_sample.old_sample(self, *args, **kwargs) - - new_sample.old_sample = transformers.GenerationMixin.sample - use_core_manipulations.sample = new_sample - - # Allow bad words filter to ban <|endoftext|> token - import transformers.generation.logits_process - def new_init(self, bad_words_ids: List[List[int]], eos_token_id: int): - return new_init.old_init(self, bad_words_ids, -1) - new_init.old_init = transformers.generation.logits_process.NoBadWordsLogitsProcessor.__init__ - transformers.generation.logits_process.NoBadWordsLogitsProcessor.__init__ = new_init - - class TokenStreamer(StoppingCriteria): - # A 
StoppingCriteria is used here because it seems to run after - # everything has been evaluated score-wise. - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - def __call__( - self, - input_ids: torch.LongTensor, - scores: torch.FloatTensor, - **kwargs, - ) -> bool: - if not koboldai_vars.inference_config.do_streaming: - return False - - if not koboldai_vars.output_streaming: - return False - - data = [applyoutputformatting(utils.decodenewlines(tokenizer.decode(x[-1])), no_sentence_trimming=True, no_single_line=True) for x in input_ids] - koboldai_vars.actions.stream_tokens(data) - return False - - class ChatModeStopper(StoppingCriteria): - # A StoppingCriteria is used here because it seems to run after - # everything has been evaluated score-wise. - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - def __call__( - self, - input_ids: torch.LongTensor, - scores: torch.FloatTensor, - **kwargs, - ) -> bool: - - if not koboldai_vars.chatmode: - return False - - data = [tokenizer.decode(x) for x in input_ids] - null_character = tokenizer.encode(chr(0))[0] - if 'completed' not in self.__dict__: - self.completed = [False]*len(input_ids) - for i in range(len(input_ids)): - if data[i][-1*(len(koboldai_vars.chatname)+1):] == koboldai_vars.chatname + ":": - self.completed[i] = True - if all(self.completed): - koboldai_vars.generated_tkns = koboldai_vars.genamt - del self.completed - return True - return False - - - class CoreStopper(StoppingCriteria): - # Controls core generation stuff; aborting, counting generated tokens, etc - def __init__(self): - self.regeneration_required = False - self.halt = False - - def __call__( - self, - input_ids: torch.LongTensor, - scores: torch.FloatTensor, - **kwargs, - ) -> bool: - if not koboldai_vars.inference_config.do_core: - return False - - koboldai_vars.generated_tkns += 1 - - if ( - not koboldai_vars.standalone - and koboldai_vars.lua_koboldbridge.generated_cols - and koboldai_vars.generated_tkns != koboldai_vars.lua_koboldbridge.generated_cols - ): - raise RuntimeError(f"Inconsistency detected between KoboldAI Python and Lua backends ({koboldai_vars.generated_tkns} != {koboldai_vars.lua_koboldbridge.generated_cols})") - - if ( - koboldai_vars.abort - or ( - koboldai_vars.inference_config.stop_at_genamt - and - koboldai_vars.generated_tkns >= koboldai_vars.genamt - ) - ): - koboldai_vars.abort = False - self.regeneration_required = False - self.halt = False - return True - - if koboldai_vars.standalone: - return False - - assert input_ids.ndim == 2 - - self.regeneration_required = koboldai_vars.lua_koboldbridge.regeneration_required - self.halt = not koboldai_vars.lua_koboldbridge.generating - koboldai_vars.lua_koboldbridge.regeneration_required = False - - for i in range(koboldai_vars.numseqs) if not koboldai_vars.alt_multi_gen else range(1): - koboldai_vars.lua_koboldbridge.generated[i+1][koboldai_vars.generated_tkns] = int(input_ids[i, -1].item()) - - return self.regeneration_required or self.halt - - - # Sets up dynamic world info scanner - class DynamicWorldInfoScanCriteria(StoppingCriteria): - def __init__( - self, - tokenizer, - excluded_world_info: List[Set], - ): - self.tokenizer = tokenizer - self.excluded_world_info = excluded_world_info - - def __call__( - self, - input_ids: torch.LongTensor, - scores: torch.FloatTensor, - **kwargs, - ) -> bool: - - if not koboldai_vars.inference_config.do_dynamic_wi: - return False - - if not koboldai_vars.dynamicscan: - return False - - if len(self.excluded_world_info) != 
input_ids.shape[0]: - print(tokenizer.decode(self.excluded_world_info)) - print(tokenizer.decode(input_ids.shape[0])) - assert len(self.excluded_world_info) == input_ids.shape[0] - - tail = input_ids[..., -koboldai_vars.generated_tkns:] - for i, t in enumerate(tail): - decoded = utils.decodenewlines(tokenizer.decode(t)) - _, _, _, found = koboldai_vars.calc_ai_text(submitted_text=decoded, send_context=False) - found = list(set(found) - set(self.excluded_world_info[i])) - if len(found) != 0: - print("Found: {}".format(found)) - show_error_notification(title="Triggered Dynamic World Info", text=found) - model.core_stopper.regeneration_required = True - return True - return False - - - old_get_stopping_criteria = transformers.GenerationMixin._get_stopping_criteria - - def new_get_stopping_criteria(self, *args, **kwargs): - global tokenizer - stopping_criteria = old_get_stopping_criteria(self, *args, **kwargs) - - self.core_stopper = CoreStopper() - self.kai_scanner = DynamicWorldInfoScanCriteria( - tokenizer=tokenizer, - excluded_world_info=self.kai_scanner_excluded_world_info, - ) - token_streamer = TokenStreamer(tokenizer=tokenizer) - - stopping_criteria.insert(0, ChatModeStopper(tokenizer=tokenizer)) - stopping_criteria.insert(0, self.kai_scanner) - token_streamer = TokenStreamer(tokenizer=tokenizer) - stopping_criteria.insert(0, token_streamer) - #This should be last - stopping_criteria.insert(0, self.core_stopper) - - return stopping_criteria - use_core_manipulations.get_stopping_criteria = new_get_stopping_criteria - def reset_model_settings(): koboldai_vars.reset_for_model_load() @@ -2574,14 +1885,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal tokenizer_id = requests.get( koboldai_vars.colaburl[:-8] + "/api/v1/model", ).json()["result"] - else: - tokenizer_id = { - "Colab": "EleutherAI/gpt-neo-2.7B", - "CLUSTER": koboldai_vars.cluster_requested_models[0] if len(koboldai_vars.cluster_requested_models) > 0 else "gpt2", - "OAI": "gpt2", - }[koboldai_vars.model] - # TODO: This should probably be a bit more robust of a check. + koboldai_vars.newlinemode = "n" if "xglm" in tokenizer_id: # Default to newline mode if using XGLM @@ -4416,7 +3721,7 @@ def apiactionsubmit_generate(txt, minimum, maximum): torch.cuda.empty_cache() # Submit input text to generator - _genout, already_generated = tpool.execute(core_generate, txt, minimum, maximum, set()) + _genout, already_generated = tpool.execute(model.core_generate, txt, minimum, maximum, set()) genout = [applyoutputformatting(utils.decodenewlines(tokenizer.decode(tokens[-already_generated:]))) for tokens in _genout] @@ -4441,7 +3746,7 @@ def apiactionsubmit_tpumtjgenerate(txt, minimum, maximum): koboldai_vars._prompt = koboldai_vars.prompt # Submit input text to generator - soft_tokens = tpumtjgetsofttokens() + soft_tokens = model.get_soft_tokens() genout = tpool.execute( tpu_mtj_backend.infer_static, np.uint32(txt), @@ -4828,855 +4133,10 @@ def calcsubmit(txt): # Send it! ikrequest(subtxt) -def core_generate(text: list, _min: int, _max: int, found_entries: set, is_core: bool = False): - # This generation function is tangled with koboldai_vars intentionally. It - # is meant for the story and nothing else. 
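-    # (It reads genamt, numseqs, the soft prompt, and the Lua bridge state
-    # directly from koboldai_vars rather than taking them as parameters.)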
- - start_time = time.time() - gen_in = torch.tensor(text, dtype=torch.long)[None] - logger.debug("core_generate: torch.tensor time {}s".format(time.time()-start_time)) - - start_time = time.time() - if koboldai_vars.is_model_torch(): - # Torch stuff - if koboldai_vars.full_determinism: - torch.manual_seed(koboldai_vars.seed) - - if koboldai_vars.sp is not None: - soft_tokens = torch.arange( - model.config.vocab_size, - model.config.vocab_size + koboldai_vars.sp.shape[0], - ) - gen_in = torch.cat((soft_tokens[None], gen_in), dim=-1) - elif koboldai_vars.use_colab_tpu: - if koboldai_vars.full_determinism: - tpu_mtj_backend.set_rng_seed(koboldai_vars.seed) - - logger.debug("core_generate: Model Setup (SP, etc) time {}s".format(time.time()-start_time)) - - if gen_in.shape[-1] + koboldai_vars.genamt > koboldai_vars.max_length: - logger.error("gen_in.shape[-1]: {}".format(gen_in.shape[-1])) - logger.error("koboldai_vars.genamt: {}".format(koboldai_vars.genamt)) - logger.error("koboldai_vars.max_length: {}".format(koboldai_vars.max_length)) - assert gen_in.shape[-1] + koboldai_vars.genamt <= koboldai_vars.max_length - - start_time = time.time() - if koboldai_vars.hascuda and koboldai_vars.usegpu: - gen_in = gen_in.to(koboldai_vars.gpu_device) - elif koboldai_vars.hascuda and koboldai_vars.breakmodel: - gen_in = gen_in.to(breakmodel.primary_device) - else: - gen_in = gen_in.to("cpu") - - logger.debug("core_generate: gen_in to device time {}s".format(time.time()-start_time)) - start_time = time.time() - found_entries = found_entries or set() - - if model: - model.model.kai_scanner_excluded_world_info = found_entries - - koboldai_vars._prompt = koboldai_vars.prompt - - with torch.no_grad(): - already_generated = 0 - numseqs = koboldai_vars.numseqs - total_gens = None - - for i in range(koboldai_vars.numseqs if koboldai_vars.alt_multi_gen else 1): - while True: - # The reason this is a loop is due to how Dynamic WI works. We - # cannot simply add the WI to the context mid-generation, so we - # stop early, and then insert WI, then continue generating. That - # stopping and continuing is this loop. - - start_time = time.time() - result = raw_generate( - gen_in[0], - max_new=koboldai_vars.genamt, - do_streaming=koboldai_vars.output_streaming, - do_dynamic_wi=koboldai_vars.dynamicscan, - batch_count=numseqs if not koboldai_vars.alt_multi_gen else 1, - # Real max length is handled by CoreStopper. - bypass_hf_maxlength=koboldai_vars.dynamicscan, - is_core=True, - ) - logger.debug("core_generate: run raw_generate pass {} {}s".format(already_generated, time.time()-start_time)) - - genout = result.encoded - - already_generated += len(genout[0]) - - try: - assert already_generated <= koboldai_vars.genamt * koboldai_vars.numseqs if koboldai_vars.alt_multi_gen else 1 - except AssertionError: - print("AlreadyGenerated", already_generated) - print("genamt", koboldai_vars.genamt) - raise - - if result.is_whole_generation: - break - - # Generation stopped; why? - # If we have been told to halt, we have reached our target token - # amount (controlled by halt), or Dynamic WI has not told us to - # stop temporarily to insert WI, we can assume that we are done - # generating. We shall break. - if model.core_stopper.halt or not model.core_stopper.regeneration_required: - break - - # Now we are doing stuff for Dynamic WI. 
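-                    # The block below copies the tokens recorded by the Lua bridge
-                    # back into genout, then calls koboldai_vars.calc_ai_text() on the
-                    # decoded output so the context is rebuilt with any newly triggered
-                    # World Info entries before the loop resumes generation.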
- assert genout.ndim >= 2 - assert genout.shape[0] == koboldai_vars.numseqs - - if(koboldai_vars.lua_koboldbridge.generated_cols and koboldai_vars.generated_tkns != koboldai_vars.lua_koboldbridge.generated_cols): - raise RuntimeError(f"Inconsistency detected between KoboldAI Python and Lua backends ({koboldai_vars.generated_tkns} != {koboldai_vars.lua_koboldbridge.generated_cols})") - - if(already_generated != koboldai_vars.generated_tkns): - print("already_generated: {}".format(already_generated)) - print("generated_tkns: {}".format(koboldai_vars.generated_tkns)) - raise RuntimeError("WI scanning error") - - for r in range(koboldai_vars.numseqs): - for c in range(already_generated): - assert koboldai_vars.lua_koboldbridge.generated[r+1][c+1] is not None - genout[r][genout.shape[-1] - already_generated + c] = koboldai_vars.lua_koboldbridge.generated[r+1][c+1] - - encoded = [] - - for i in range(koboldai_vars.numseqs): - txt = utils.decodenewlines(tokenizer.decode(genout[i, -already_generated:])) - #winfo, mem, anotetxt, _found_entries = calcsubmitbudgetheader(txt, force_use_txt=True, actions=koboldai_vars.actions) - #txt, _, _ = calcsubmitbudget(len(koboldai_vars.actions), winfo, mem, anotetxt, koboldai_vars.actions, submission=txt) - txt, _, _, _found_entries = koboldai_vars.calc_ai_text(submitted_text=txt, send_context=False) - found_entries[i].update(_found_entries) - encoded.append(torch.tensor(txt, dtype=torch.long, device=genout.device)) - - max_length = len(max(encoded, key=len)) - encoded = torch.stack(tuple(torch.nn.functional.pad(e, (max_length - len(e), 0), value=model.config.pad_token_id or model.config.eos_token_id) for e in encoded)) - genout = torch.cat( - ( - encoded, - genout[..., -already_generated:], - ), - dim=-1 - ) - - if(koboldai_vars.sp is not None): - soft_tokens = torch.arange( - model.config.vocab_size, - model.config.vocab_size + koboldai_vars.sp.shape[0], - device=genout.device, - ) - genout = torch.cat((soft_tokens.tile(koboldai_vars.numseqs, 1), genout), dim=-1) - assert genout.shape[-1] + koboldai_vars.genamt - already_generated <= koboldai_vars.max_length - gen_in = genout - numseqs = 1 - if total_gens is None: - total_gens = genout - else: - total_gens = torch.cat((total_gens, genout)) - - return total_gens, already_generated - -class GenerationResult: - def __init__( - self, - out_batches: list, - prompt: list, - - # Controls if generate() does it's looping thing. This should only be - # done for HF models that use that StoppingCondition - is_whole_generation: bool, - - # Controls if we should trim output by prompt length - output_includes_prompt: bool = False, - - # Lazy filter to cut off extra lines where we can't manipulate - # probabilities - single_line: bool = False, - ): - # Shave prompt off of encoded response when needed (HF). Decoded does - # not return prompt. 
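-        # (The HF torch and RWKV paths return [prompt + continuation] per batch
-        # row, so output_includes_prompt=True drops the first len(prompt) columns;
-        # the TPU and API backends already return only the continuation.)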
- if output_includes_prompt: - self.encoded = out_batches[:, len(prompt):] - else: - self.encoded = out_batches - - self.prompt = prompt - self.is_whole_generation = is_whole_generation - - self.decoded = [utils.decodenewlines(tokenizer.decode(enc)) for enc in self.encoded] - - if single_line: - self.decoded = [x.split("\n", 1)[0] for x in self.decoded] - self.encoded = np.array(tokenizer(self.decoded).input_ids) - -class GenerationSettings: - def __init__(self, **overrides) -> None: - for setting in [ - "temp", - "top_p", - "top_k", - "tfs", - "typical", - "top_a", - "rep_pen", - "rep_pen_slope", - "rep_pen_range", - "sampler_order", - ]: - setattr( - self, - setting, - overrides.get(setting, getattr(koboldai_vars, setting)) - ) - -def get_auxilary_device(): - # NOTE: Does not include TPU! - if koboldai_vars.hascuda and koboldai_vars.usegpu: - return koboldai_vars.gpu_device - elif koboldai_vars.hascuda and koboldai_vars.breakmodel: - return breakmodel.primary_device - return "cpu" - -def raw_generate( - # prompt is either a string (text) or a list (token ids) - prompt: Union[str, list, np.ndarray], - max_new: int, - - do_streaming: bool = False, - do_dynamic_wi: bool = False, - batch_count: int = 1, - bypass_hf_maxlength: bool = False, - generation_settings: Optional[dict] = None, - is_core: bool = False, - single_line: bool = False, - found_entries: set = () -) -> GenerationResult: - # TODO: Support singleline outside of torch - - koboldai_vars.inference_config.do_core = is_core - gen_settings = GenerationSettings(*(generation_settings or {})) - - model_functions = { - "GooseAI": oai_raw_generate, - "OAI": oai_raw_generate, - "CLUSTER": cluster_raw_generate, - "Colab": colab_raw_generate, - "API": api_raw_generate, - } - - if isinstance(prompt, torch.Tensor): - prompt_tokens = prompt.cpu().numpy() - elif isinstance(prompt, list): - prompt_tokens = np.array(prompt) - elif isinstance(prompt, str): - prompt_tokens = np.array(tokenizer.encode(prompt)) - else: - raise ValueError(f"Prompt is {type(prompt)}. 
Not a fan!") - - assert isinstance(prompt_tokens, np.ndarray) - assert len(prompt_tokens.shape) == 1 - - if koboldai_vars.model == "ReadOnly": - raise NotImplementedError("No loaded model") - - result: GenerationResult - time_start = time.time() - - with use_core_manipulations(): - if koboldai_vars.use_colab_tpu or koboldai_vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): - batch_encoded = tpu_raw_generate( - prompt_tokens=prompt_tokens, - max_new=max_new, - batch_count=batch_count, - gen_settings=gen_settings - ) - result = GenerationResult( - out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True, single_line=single_line - ) - elif koboldai_vars.model in model_functions: - batch_encoded = model_functions[koboldai_vars.model]( - prompt_tokens=prompt_tokens, - max_new=max_new, - batch_count=batch_count, - gen_settings=gen_settings - ) - result = GenerationResult( - out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True, single_line=single_line - ) - elif koboldai_vars.model.startswith("RWKV"): - batch_encoded = rwkv_raw_generate( - prompt_tokens=prompt_tokens, - max_new=max_new, - batch_count=batch_count, - gen_settings=gen_settings - ) - result = GenerationResult( - out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True, output_includes_prompt=True, single_line=single_line - ) - else: - # Torch HF - start_time = time.time() - batch_encoded = torch_raw_generate( - prompt_tokens=prompt_tokens, - max_new=max_new if not bypass_hf_maxlength else int(2e9), - do_streaming=do_streaming, - do_dynamic_wi=do_dynamic_wi, - single_line=single_line, - batch_count=batch_count, - gen_settings=gen_settings, - ) - logger.debug("raw_generate: run torch_raw_generate {}s".format(time.time()-start_time)) - start_time = time.time() - result = GenerationResult( - out_batches=batch_encoded, - prompt=prompt_tokens, - is_whole_generation=False, - output_includes_prompt=True, - ) - logger.debug("raw_generate: run GenerationResult {}s".format(time.time()-start_time)) - - time_end = round(time.time() - time_start, 2) - tokens_per_second = round(len(result.encoded[0]) / time_end, 2) - - if not koboldai_vars.quiet: - logger.info(f"Generated {len(result.encoded[0])} tokens in {time_end} seconds, for an average rate of {tokens_per_second} tokens per second.") - - return result - -def tpu_raw_generate( - prompt_tokens: List[int], - max_new: int, - batch_count: int, - gen_settings: GenerationSettings -): - - # Mostly lifted from apiactionsubmit_tpumtjgenerate - soft_tokens = tpumtjgetsofttokens() - - genout = tpool.execute( - tpu_mtj_backend.infer_static, - np.uint32(prompt_tokens), - gen_len = max_new, - temp=gen_settings.temp, - top_p=gen_settings.top_p, - top_k=gen_settings.top_k, - tfs=gen_settings.tfs, - typical=gen_settings.typical, - top_a=gen_settings.top_a, - numseqs=batch_count, - repetition_penalty=gen_settings.rep_pen, - rpslope=gen_settings.rep_pen_slope, - rprange=gen_settings.rep_pen_range, - soft_embeddings=koboldai_vars.sp, - soft_tokens=soft_tokens, - sampler_order=gen_settings.sampler_order, - ) - genout = np.array(genout) - - return genout - -def torch_raw_generate( - prompt_tokens: Union[List[int], torch.Tensor], - max_new: int, - gen_settings: GenerationSettings, - - do_streaming: bool = False, - do_dynamic_wi: bool = False, - single_line: bool = False, - batch_count: int = 1, -): - start_time = time.time() - koboldai_vars.inference_config.do_streaming = do_streaming - koboldai_vars.inference_config.do_dynamic_wi = 
do_dynamic_wi - - # Dynamic WI depends on this!!! This is a main gen call. - koboldai_vars.inference_config.stop_at_genamt = do_dynamic_wi - - # Makes stopping criteria hook happy - try: - model.kai_scanner_excluded_world_info = model.kai_scanner_excluded_world_info - except AttributeError: - model.kai_scanner_excluded_world_info = set() - - logger.debug("torch_raw_generate: setup inference_config {}s".format(time.time()-start_time)) - - if not isinstance(prompt_tokens, torch.Tensor): - gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] - else: - gen_in = prompt_tokens - - device = get_auxilary_device() - gen_in = gen_in.to(device) - - additional_bad_words_ids = [tokenizer.encode("\n")] if single_line else [] - - with torch.no_grad(): - start_time = time.time() - # HACK: raw_generate functions should be in the model itself - genout = model.model.generate( - gen_in, - do_sample=True, - max_length=min(len(prompt_tokens) + max_new, koboldai_vars.max_length), - repetition_penalty=1.0, - bad_words_ids=koboldai_vars.badwordsids + additional_bad_words_ids, - use_cache=True, - num_return_sequences=batch_count, - ) - logger.debug("torch_raw_generate: run generator {}s".format(time.time()-start_time)) - - return genout - -def oai_raw_generate( - prompt_tokens: List[int], - max_new: int, - batch_count: int, - gen_settings: GenerationSettings, -): - # Taken mainly from oairequest() - - decoded_prompt = utils.decodenewlines(tokenizer.decode(prompt_tokens)) - - # Log request to console - if not koboldai_vars.quiet: - print("{0}Len:{1}, Txt:{2}{3}".format(colors.YELLOW, len(decoded_prompt), decoded_prompt, colors.END)) - - # Store context in memory to use it for comparison with generated content - koboldai_vars.lastctx = decoded_prompt - - # Build request JSON data - # GooseAI is a subntype of OAI. 
So to check if it's this type, we check the configname as a workaround - # as the koboldai_vars.model will always be OAI - if 'GooseAI' in koboldai_vars.configname: - reqdata = { - 'prompt': decoded_prompt, - 'max_tokens': max_new, - 'temperature': gen_settings.temp, - 'top_a': gen_settings.top_a, - 'top_p': gen_settings.top_p, - 'top_k': gen_settings.top_k, - 'tfs': gen_settings.tfs, - 'typical_p': gen_settings.typical, - 'repetition_penalty': gen_settings.rep_pen, - 'repetition_penalty_slope': gen_settings.rep_pen_slope, - 'repetition_penalty_range': gen_settings.rep_pen_range, - 'n': batch_count, - # TODO: Implement streaming - 'stream': False - } - else: - reqdata = { - 'prompt': decoded_prompt, - 'max_tokens': max_new, - 'temperature': gen_settings.temp, - 'top_p': gen_settings.top_p, - 'n': batch_count, - 'stream': False - } - - req = requests.post( - koboldai_vars.oaiurl, - json = reqdata, - headers = { - 'Authorization': 'Bearer '+koboldai_vars.oaiapikey, - 'Content-Type': 'application/json' - } - ) - - j = req.json() - # Deal with the response - if req.ok: - outputs = [out["text"] for out in j["choices"]] - - if not koboldai_vars.quiet: - print("{0}{1}{2}".format(colors.CYAN, outputs, colors.END)) - - return np.array([tokenizer.encode(x) for x in outputs]) - else: - # Send error message to web client - if "error" in j: - error_type = j["error"]["type"] - error_message = j["error"]["message"] - else: - error_type = "Unknown" - error_message = "Unknown" - - emit('from_server', { - 'cmd': 'errmsg', - 'data': f"OpenAI API Error: {error_type} - {error_message}" - }, broadcast=True, room="UI_1") - set_aibusy(0) - return [] class HordeException(Exception): pass -def cluster_raw_generate( - prompt_tokens: List[int], - max_new: int, - batch_count: int, - gen_settings: GenerationSettings, -): - decoded_prompt = utils.decodenewlines(tokenizer.decode(prompt_tokens)) - - # Store context in memory to use it for comparison with generated content - koboldai_vars.lastctx = decoded_prompt - - # Build request JSON data - reqdata = { - 'max_length': max_new, - 'max_context_length': koboldai_vars.max_length, - 'rep_pen': gen_settings.rep_pen, - 'rep_pen_slope': gen_settings.rep_pen_slope, - 'rep_pen_range': gen_settings.rep_pen_range, - 'temperature': gen_settings.temp, - 'top_p': gen_settings.top_p, - 'top_k': int(gen_settings.top_k), - 'top_a': gen_settings.top_a, - 'tfs': gen_settings.tfs, - 'typical': gen_settings.typical, - 'n': batch_count, - } - - cluster_metadata = { - 'prompt': decoded_prompt, - 'params': reqdata, - 'models': [x for x in koboldai_vars.cluster_requested_models if x], - 'trusted_workers': False, - } - - cluster_headers = {'apikey': koboldai_vars.apikey} - - try: - # Create request - req = requests.post( - koboldai_vars.colaburl[:-8] + "/api/v2/generate/async", - json=cluster_metadata, - headers=cluster_headers - ) - except requests.exceptions.ConnectionError: - errmsg = f"Horde unavailable. Please try again later" - logger.error(errmsg) - raise HordeException(errmsg) - - if req.status_code == 503: - errmsg = f"KoboldAI API Error: No available KoboldAI servers found in Horde to fulfil this request using the selected models or other properties." - logger.error(errmsg) - raise HordeException(errmsg) - elif not req.ok: - errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console." 
- logger.error(errmsg) - logger.error(f"HTTP {req.status_code}!!!") - logger.error(req.text) - raise HordeException(errmsg) - - try: - js = req.json() - except requests.exceptions.JSONDecodeError: - errmsg = f"Unexpected message received from the Horde: '{req.text}'" - logger.error(errmsg) - raise HordeException(errmsg) - - request_id = js["id"] - logger.debug("Horde Request ID: {}".format(request_id)) - - # We've sent the request and got the ID back, now we need to watch it to see when it finishes - finished = False - - while not finished: - try: - req = requests.get(koboldai_vars.colaburl[:-8] + "/api/v1/generate/check/" + request_id) - except requests.exceptions.ConnectionError: - errmsg = f"Horde unavailable. Please try again later" - logger.error(errmsg) - raise HordeException(errmsg) - - if not req.ok: - errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console." - logger.error(req.text) - raise HordeException(errmsg) - - try: - js = req.json() - except requests.exceptions.JSONDecodeError: - errmsg = f"Unexpected message received from the KoboldAI Horde: '{req.text}'" - logger.error(errmsg) - raise HordeException(errmsg) - - if "done" not in js: - errmsg = f"Unexpected response received from the KoboldAI Horde: '{js}'" - logger.error(errmsg ) - raise HordeException(errmsg) - - finished = js["done"] - koboldai_vars.horde_wait_time = js["wait_time"] - koboldai_vars.horde_queue_position = js["queue_position"] - koboldai_vars.horde_queue_size = js["waiting"] - - if not finished: - logger.debug(js) - time.sleep(1) - - logger.debug("Last Horde Status Message: {}".format(js)) - js = requests.get(koboldai_vars.colaburl[:-8] + "/api/v1/generate/prompt/" + request_id).json()['generations'] - logger.debug("Horde Result: {}".format(js)) - - gen_servers = [(cgen['server_name'],cgen['server_id']) for cgen in js] - logger.info(f"Generations by: {gen_servers}") - - # TODO: Fix this, using tpool so it's a context error - # Just in case we want to announce it to the user - # if len(js) == 1: - # warnmsg = f"Text generated by {js[0]['server_name']}" - # emit('from_server', {'cmd': 'warnmsg', 'data': warnmsg}, broadcast=True) - - return np.array([tokenizer.encode(cgen["text"]) for cgen in js]) - -def colab_raw_generate( - prompt_tokens: List[int], - max_new: int, - batch_count: int, - gen_settings: GenerationSettings, -): - decoded_prompt = utils.decodenewlines(tokenizer.decode(prompt_tokens)) - - # Store context in memory to use it for comparison with generated content - koboldai_vars.lastctx = decoded_prompt - - # Build request JSON data - reqdata = { - 'text': decoded_prompt, - 'min': 0, - 'max': max_new, - 'rep_pen': gen_settings.rep_pen, - 'rep_pen_slope': gen_settings.rep_pen_slope, - 'rep_pen_range': gen_settings.rep_pen_range, - 'temperature': gen_settings.temp, - 'top_p': gen_settings.top_p, - 'top_k': gen_settings.top_k, - 'tfs': gen_settings.tfs, - 'typical': gen_settings.typical, - 'topa': gen_settings.top_a, - 'numseqs': batch_count, - 'retfultxt': False - } - - # Create request - req = requests.post( - koboldai_vars.colaburl, - json = reqdata - ) - - # Deal with the response - if(req.status_code == 200): - js = req.json()["data"] - - # Try to be backwards compatible with outdated colab - if("text" in js): - genout = [getnewcontent(js["text"])] - else: - genout = js["seqs"] - - return np.array([tokenizer.encode(x) for x in genout]) - -def api_raw_generate( - prompt_tokens: List[int], - max_new: int, - batch_count: int, - gen_settings: 
GenerationSettings, -): - decoded_prompt = utils.decodenewlines(tokenizer.decode(prompt_tokens)) - - # Store context in memory to use it for comparison with generated content - koboldai_vars.lastctx = decoded_prompt - - # Build request JSON data - reqdata = { - 'prompt': decoded_prompt, - 'max_length': max_new, - 'max_context_length': koboldai_vars.max_length, - 'rep_pen': gen_settings.rep_pen, - 'rep_pen_slope': gen_settings.rep_pen_slope, - 'rep_pen_range': gen_settings.rep_pen_range, - 'temperature': gen_settings.temp, - 'top_p': gen_settings.top_p, - 'top_k': gen_settings.top_k, - 'top_a': gen_settings.top_a, - 'tfs': gen_settings.tfs, - 'typical': gen_settings.typical, - 'n': batch_count, - } - - # Create request - while True: - req = requests.post( - koboldai_vars.colaburl[:-8] + "/api/v1/generate", - json=reqdata, - ) - if(req.status_code == 503): # Server is currently generating something else so poll until it's our turn - time.sleep(1) - continue - js = req.json() - if(req.status_code != 200): - errmsg = "KoboldAI API Error: Failed to get a reply from the server. Please check the console." - print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END)) - emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True) - emit("error", errmsg, broadcast=True, room="UI_2") - set_aibusy(0) - return - - genout = [obj["text"] for obj in js["results"]] - return np.array([tokenizer.encode(x) for x in genout]) - -def rwkv_raw_generate( - prompt_tokens: List[int], - max_new: int, - batch_count: int, - gen_settings: GenerationSettings, -): - import types - - model.clear() - context = list(prompt_tokens) - - input_length = len(prompt_tokens) - - # TODO: Not needed every run? I think this is creating that huge wait time - # between generations. - init_state = types.SimpleNamespace() - for i in range(input_length): - x = context[:i+1] - if i == input_length - 1: - init_state.out = model.run(x) - else: - model.run(x) - model.save(init_state) - - for ni, i in enumerate(range(input_length, input_length + max_new)): - x = context[:i+1] - x = x[-model.ctx_len:] - - if i == input_length: - out = copy.deepcopy(init_state.out) - else: - out = model.run(x) - - # Don't generate EOS - out[0] = -9999999 - - char = tokenizer.sample_logits( - out=out, - x=x, - ctx_len=model.ctx_len, - temperature=gen_settings.temp, - top_p=gen_settings.top_p, - ) - char = char.item() - context.append(char) - - if koboldai_vars.output_streaming: - koboldai_vars.actions.stream_tokens([utils.decodenewlines(tokenizer.decode(char))]) - - # HACK - if ni > max_new: - break - - return np.array([context]) - - -@dataclass -class RWKVConfig: - n_layer: int - n_embed: int - ctx_len: int - -def rwkv_init(model_class: str, use_gpu: bool = False): - torch.backends.cudnn.allow_tf32 = True - torch.backends.cuda.matmul.allow_tf32 = True - os.environ["RWKV_FLOAT_MODE"] = "bf16" - - logger.info("[RWKV] RWKV support is in super-duper-uber-schmoober alpha and will ignore many options.") - - device = "cpu" - - if use_gpu: - logger.warning("[RWKV] Using GPU. This may not work out of the box and may require significant setup.") - device = "cuda" - - os.environ["RWKV_RUN_DEVICE"] = device - - TOKENIZER_PATH = "RWKV4/20B_tokenizer.json" - MODEL_DIR = "models" - - model_files = os.listdir(MODEL_DIR) - matching_models = [f for f in model_files if f.startswith(f"RWKV-4-{model_class}")] - - if not matching_models: - raise RuntimeError( - f"No models of class '{model_class}' found in '{MODEL_DIR}'. 
Please download a model from " \ - "https://huggingface.co/BlinkDL, rename the .pth file to 'model.pth', and place the it in a directory named "\ - "'{MODEL_DIR}/RWKV-4-XYZ', where XYZ is the parameter string of the model (169M, 430M, 1B5, 3B, or 7B)." - ) - model_path = os.path.join(MODEL_DIR, sorted(matching_models)[-1], "model.pth") - - model_config = { - "169M": RWKVConfig(n_layer=12, n_embed=768, ctx_len=1024), - "430M": RWKVConfig(n_layer=24, n_embed=1024, ctx_len=1024), - "1B5": RWKVConfig(n_layer=24, n_embed=2048, ctx_len=1024), - "3B": RWKVConfig(n_layer=32, n_embed=2560, ctx_len=1024), - "7B": RWKVConfig(n_layer=32, n_embed=4096, ctx_len=1024), - }.get(model_class) - - if not model_config: - raise RuntimeError(f"No config for model '{model_class}' found!") - - if not os.path.exists(TOKENIZER_PATH): - raise RuntimeError( - f"Can't find tokenizer at '{TOKENIZER_PATH}'! Please download it from " \ - f"https://raw.githubusercontent.com/BlinkDL/RWKV-LM/main/RWKV-v4/20B_tokenizer.json and place it at '{TOKENIZER_PATH}" - ) - - # Model stuff - from RWKV4.src.model_run import RWKV_RNN - from transformers import PreTrainedTokenizerFast - from torch.nn import functional as F - - model = RWKV_RNN( - model_path.split(".")[0], - device, - "RWKV", - model_config.n_layer, - model_config.n_embed, - model_config.ctx_len, - ) - tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_PATH) - - # We'll just patch tokenizer ourselves to make it easier - def _sample_logits(self, out, x, ctx_len, temperature, top_p): - last_char = int(x[-1]) - probs = F.softmax(torch.tensor(out), dim=-1) - sorted_probs, s_index = torch.sort(probs, descending=True) - - cumulative_probs = torch.cumsum(sorted_probs, dim=-1).numpy() - cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)]) - - probs[probs < cutoff] = 0 - - if temperature != 1.0: - probs = probs.pow(1.0 / temperature) - - return torch.multinomial(probs, num_samples=1)[0] - - tokenizer.sample_logits = _sample_logits.__get__(tokenizer, AutoTokenizer) - - tokenizer._koboldai_header = [] - tokenizer.add_bos_token = False - tokenizer.add_prefix_space = False - - logger.info("[RWKV] Loaded :^)") - return model, tokenizer - #==================================================================# # Send text to generator and deal with output #==================================================================# @@ -5703,7 +4163,7 @@ def generate(txt, minimum, maximum, found_entries=None): # Submit input text to generator try: start_time = time.time() - genout, already_generated = tpool.execute(core_generate, txt, minimum, maximum, found_entries) + genout, already_generated = tpool.execute(model.core_generate, txt, minimum, maximum, found_entries) logger.debug("Generate: core_generate time {}s".format(time.time()-start_time)) except Exception as e: if(issubclass(type(e), lupa.LuaError)): @@ -7340,8 +5800,8 @@ def final_startup(): file.close() # Precompile TPU backend if required - if(koboldai_vars.use_colab_tpu or koboldai_vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): - soft_tokens = tpumtjgetsofttokens() + if isinstance(model, HFMTJInferenceModel): + soft_tokens = model.get_soft_tokens() if(koboldai_vars.dynamicscan or (not koboldai_vars.nogenmod and koboldai_vars.has_genmod)): tpool.execute(tpu_mtj_backend.infer_dynamic, np.tile(np.uint32((23403, 727, 20185)), (koboldai_vars.numseqs, 1)), soft_embeddings= koboldai_vars.sp, diff --git a/model.py b/model.py index 3acb458d..1e43976f 100644 --- a/model.py +++ b/model.py @@ -1,11 
+1,14 @@ # TODO: # - Intertwine stoppers and streaming and such -# - Add raw_generate functions to this -# - Support TPU -# - Support APIs # - Support RWKV +# - Support 8-bit lazy-load +from __future__ import annotations import bisect +import copy +import requests +from dataclasses import dataclass +from eventlet import tpool import gc import shutil import contextlib @@ -13,30 +16,1174 @@ import functools import itertools import json import os +import time import traceback +from typing import Dict, Iterable, List, Optional, Union import zipfile -import utils -import breakmodel - -import torch -from torch.nn import Embedding - from tqdm.auto import tqdm from logger import logger import torch_lazy_loader -from typing import Dict, List, Optional, Union -from transformers import StoppingCriteria, GPT2Tokenizer, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel, modeling_utils, AutoModelForTokenClassification, AutoConfig -# Previously under condition HAS_ACCELERATE, but I'm quite sure accelerate -# is now a dependency. +import torch +from torch.nn import Embedding +import numpy as np import accelerate.utils +import transformers +from transformers import ( + StoppingCriteria, + GPT2Tokenizer, + GPT2LMHeadModel, + GPTNeoForCausalLM, + GPTNeoModel, + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + PreTrainedModel, + modeling_utils, + AutoModelForTokenClassification, + AutoConfig, +) +import utils +import breakmodel import koboldai_settings +try: + import tpu_mtj_backend +except ModuleNotFoundError as e: + # Not on TPU... hopefully + if utils.koboldai_vars.use_colab_tpu: + raise e + + +class OpenAIAPIError(Exception): + def __init__(self, error_type: str, error_message) -> None: + super().__init__(f"{error_type}: {error_message}") + + +class HordeException(Exception): + pass + + +class ColabException(Exception): + pass + + +class APIException(Exception): + pass + + +class GenerationSettings: + def __init__(self, **overrides) -> None: + for setting in [ + "temp", + "top_p", + "top_k", + "tfs", + "typical", + "top_a", + "rep_pen", + "rep_pen_slope", + "rep_pen_range", + "sampler_order", + ]: + setattr( + self, + setting, + overrides.get(setting, getattr(utils.koboldai_vars, setting)), + ) + + +# We only want to use logit manipulations and such on our core text model +class use_core_manipulations: + # These must be set by wherever they get setup + get_logits_processor: callable + sample: callable + get_stopping_criteria: callable + + # We set these automatically + old_get_logits_processor: callable + old_sample: callable + old_get_stopping_criteria: callable + + def __enter__(self): + use_core_manipulations.old_get_logits_processor = ( + transformers.GenerationMixin._get_logits_processor + ) + transformers.GenerationMixin._get_logits_processor = ( + use_core_manipulations.get_logits_processor + ) + + use_core_manipulations.old_sample = transformers.GenerationMixin.sample + transformers.GenerationMixin.sample = use_core_manipulations.sample + + use_core_manipulations.old_get_stopping_criteria = ( + transformers.GenerationMixin._get_stopping_criteria + ) + transformers.GenerationMixin._get_stopping_criteria = ( + use_core_manipulations.get_stopping_criteria + ) + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + transformers.GenerationMixin._get_logits_processor = ( + use_core_manipulations.old_get_logits_processor + ) + transformers.GenerationMixin.sample = 
use_core_manipulations.old_sample + transformers.GenerationMixin._get_stopping_criteria = ( + use_core_manipulations.old_get_stopping_criteria + ) + + +def patch_transformers_download(): + def http_get( + url: str, + temp_file, + proxies=None, + resume_size=0, + headers=None, + file_name=None, + ): + """ + Download remote file. Do not gobble up errors. + """ + headers = copy.deepcopy(headers) + if resume_size > 0: + headers["Range"] = f"bytes={resume_size}-" + r = requests.get(url, stream=True, proxies=proxies, headers=headers) + transformers.utils.hub._raise_for_status(r) + content_length = r.headers.get("Content-Length") + total = ( + resume_size + int(content_length) if content_length is not None else None + ) + # `tqdm` behavior is determined by `utils.logging.is_progress_bar_enabled()` + # and can be set using `utils.logging.enable/disable_progress_bar()` + if url[-11:] != "config.json": + progress = tqdm.tqdm( + unit="B", + unit_scale=True, + unit_divisor=1024, + total=total, + initial=resume_size, + desc=f"Downloading {file_name}" + if file_name is not None + else "Downloading", + file=utils.UIProgressBarFile, + ) + utils.koboldai_vars.status_message = "Download Model" + utils.koboldai_vars.total_download_chunks = total + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + if url[-11:] != "config.json": + progress.update(len(chunk)) + utils.koboldai_vars.downloaded_chunks += len(chunk) + temp_file.write(chunk) + if url[-11:] != "config.json": + progress.close() + + utils.koboldai_vars.status_message = "" + + transformers.utils.hub.http_get = http_get + + +def patch_transformers(): + # ????? why is this needed + global transformers + print(transformers) + patch_transformers_download() + + old_from_pretrained = PreTrainedModel.from_pretrained.__func__ + + @classmethod + def new_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + utils.koboldai_vars.fp32_model = False + utils.num_shards = None + utils.current_shard = 0 + utils.from_pretrained_model_name = pretrained_model_name_or_path + utils.from_pretrained_index_filename = None + utils.from_pretrained_kwargs = kwargs + utils.bar = None + if not utils.args.no_aria2: + utils.aria2_hook(pretrained_model_name_or_path, **kwargs) + return old_from_pretrained( + cls, pretrained_model_name_or_path, *model_args, **kwargs + ) + + if not hasattr(PreTrainedModel, "_kai_patched"): + PreTrainedModel.from_pretrained = new_from_pretrained + PreTrainedModel._kai_patched = True + if hasattr(modeling_utils, "get_checkpoint_shard_files"): + old_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files + + def new_get_checkpoint_shard_files( + pretrained_model_name_or_path, index_filename, *args, **kwargs + ): + utils.num_shards = utils.get_num_shards(index_filename) + utils.from_pretrained_index_filename = index_filename + return old_get_checkpoint_shard_files( + pretrained_model_name_or_path, index_filename, *args, **kwargs + ) + + modeling_utils.get_checkpoint_shard_files = new_get_checkpoint_shard_files + + # Patch transformers to use our custom logit warpers + from transformers import ( + LogitsProcessorList, + LogitsWarper, + LogitsProcessor, + TopKLogitsWarper, + TopPLogitsWarper, + TemperatureLogitsWarper, + RepetitionPenaltyLogitsProcessor, + ) + from warpers import ( + AdvancedRepetitionPenaltyLogitsProcessor, + TailFreeLogitsWarper, + TypicalLogitsWarper, + TopALogitsWarper, + ) + + def dynamic_processor_wrap(cls, field_name, var_name, cond=None): + old_call = 
cls.__call__ + + def new_call(self, *args, **kwargs): + if not isinstance(field_name, str) and isinstance(field_name, Iterable): + conds = [] + for f, v in zip(field_name, var_name): + conds.append(getattr(utils.koboldai_vars, v)) + setattr(self, f, conds[-1]) + else: + conds = getattr(utils.koboldai_vars, var_name) + setattr(self, field_name, conds) + assert len(args) == 2 + if cond is None or cond(conds): + return old_call(self, *args, **kwargs) + return args[1] + + cls.__call__ = new_call + + dynamic_processor_wrap( + AdvancedRepetitionPenaltyLogitsProcessor, + ("penalty", "penalty_slope", "penalty_range", "use_alt_rep_pen"), + ("rep_pen", "rep_pen_slope", "rep_pen_range", "use_alt_rep_pen"), + cond=lambda x: x[0] != 1.0, + ) + dynamic_processor_wrap(TopKLogitsWarper, "top_k", "top_k", cond=lambda x: x > 0) + dynamic_processor_wrap(TopALogitsWarper, "top_a", "top_a", cond=lambda x: x > 0.0) + dynamic_processor_wrap(TopPLogitsWarper, "top_p", "top_p", cond=lambda x: x < 1.0) + dynamic_processor_wrap(TailFreeLogitsWarper, "tfs", "tfs", cond=lambda x: x < 1.0) + dynamic_processor_wrap( + TypicalLogitsWarper, "typical", "typical", cond=lambda x: x < 1.0 + ) + dynamic_processor_wrap( + TemperatureLogitsWarper, "temperature", "temp", cond=lambda x: x != 1.0 + ) + + class PhraseBiasLogitsProcessor(LogitsProcessor): + def __init__(self): + pass + + def _find_intersection(self, big: List, small: List) -> int: + """Find the maximum overlap between the beginning of small and the end of big. + Return the index of the token in small following the overlap, or 0. + + big: The tokens in the context (as a tensor) + small: The tokens in the phrase to bias (as a list) + + Both big and small are in "oldest to newest" order. + """ + # There are asymptotically more efficient methods for determining the overlap, + # but typically there will be few (0-1) instances of small[0] in the last len(small) + # elements of big, plus small will typically be fairly short. So this naive + # approach is acceptable despite O(N^2) worst case performance. + + num_small = len(small) + # The small list can only ever match against at most num_small tokens of big, + # so create a slice. Typically, this slice will be as long as small, but it + # may be shorter if the story has just started. + # We need to convert the big slice to list, since natively big is a tensor + # and tensor and list don't ever compare equal. It's better to convert here + # and then use native equality tests than to iterate repeatedly later. + big_slice = list(big[-num_small:]) + + # It's possible that the start token appears multiple times in small + # For example, consider the phrase: + # [ fair is foul, and foul is fair, hover through the fog and filthy air] + # If we merely look for the first instance of [ fair], then we would + # generate the following output: + # " fair is foul, and foul is fair is foul, and foul is fair..." + start = small[0] + for i, t in enumerate(big_slice): + # Strictly unnecessary, but it's marginally faster to test the first + # token before creating slices to test for a full match. + if t == start: + remaining = len(big_slice) - i + if big_slice[i:] == small[:remaining]: + # We found a match. If the small phrase has any remaining tokens + # then return the index of the next token. + if remaining < num_small: + return remaining + # In this case, the entire small phrase matched, so start over. + return 0 + + # There were no matches, so just begin at the beginning. 
+ return 0 + + def _allow_leftwards_tampering(self, phrase: str) -> bool: + """Determines if a phrase should be tampered with from the left in + the "soft" token encoding mode.""" + + if phrase[0] in [".", "?", "!", ";", ":", "\n"]: + return False + return True + + def _get_token_sequence(self, phrase: str) -> List[List]: + """Convert the phrase string into a list of encoded biases, each + one being a list of tokens. How this is done is determined by the + phrase's format: + + - If the phrase is surrounded by square brackets ([]), the tokens + will be the phrase split by commas (,). If a "token" isn't + actually a number, it will be skipped. NOTE: Tokens output by + this may not be in the model's vocabulary, and such tokens + should be ignored later in the pipeline. + - If the phrase is surrounded by curly brackets ({}), the phrase + will be directly encoded with no synonym biases and no fancy + tricks. + - Otherwise, the phrase will be encoded, with close deviations + being included as synonym biases. + """ + + # TODO: Cache these tokens, invalidate when model or bias is + # changed. + + # Handle direct token id input + if phrase.startswith("[") and phrase.endswith("]"): + no_brackets = phrase[1:-1] + ret = [] + for token_id in no_brackets.split(","): + try: + ret.append(int(token_id)) + except ValueError: + # Ignore non-numbers. Rascals! + pass + return [ret] + + # Handle direct phrases + if phrase.startswith("{") and phrase.endswith("}"): + no_brackets = phrase[1:-1] + return [tokenizer.encode(no_brackets)] + + # Handle untamperable phrases + if not self._allow_leftwards_tampering(phrase): + return [tokenizer.encode(phrase)] + + # Handle slight alterations to original phrase + phrase = phrase.strip(" ") + ret = [] + + for alt_phrase in [phrase, f" {phrase}"]: + ret.append(tokenizer.encode(alt_phrase)) + + return ret + + def _get_biased_tokens(self, input_ids: List) -> Dict: + # TODO: Different "bias slopes"? + + ret = {} + for phrase, _bias in utils.koboldai_vars.biases.items(): + bias_score, completion_threshold = _bias + token_seqs = self._get_token_sequence(phrase) + variant_deltas = {} + for token_seq in token_seqs: + bias_index = self._find_intersection(input_ids, token_seq) + + # Ensure completion after completion_threshold tokens + # Only provide a positive bias when the base bias score is positive. + if bias_score > 0 and bias_index + 1 > completion_threshold: + bias_score = 999 + + token_to_bias = token_seq[bias_index] + variant_deltas[token_to_bias] = bias_score + + # If multiple phrases bias the same token, add the modifiers + # together. 
This should NOT be applied to automatic variants + for token_to_bias, bias_score in variant_deltas.items(): + if token_to_bias in ret: + ret[token_to_bias] += bias_score + else: + ret[token_to_bias] = bias_score + return ret + + def __call__( + self, input_ids: torch.LongTensor, scores: torch.FloatTensor + ) -> torch.FloatTensor: + assert scores.ndim == 2 + assert input_ids.ndim == 2 + + scores_shape = scores.shape + + for batch in range(scores_shape[0]): + for token, bias in self._get_biased_tokens(input_ids[batch]).items(): + scores[batch][token] += bias + + return scores + + class LuaLogitsProcessor(LogitsProcessor): + def __init__(self): + pass + + def __call__( + self, input_ids: torch.LongTensor, scores: torch.FloatTensor + ) -> torch.FloatTensor: + assert scores.ndim == 2 + assert input_ids.ndim == 2 + self.regeneration_required = False + self.halt = False + + if utils.koboldai_vars.standalone: + return scores + + scores_shape = scores.shape + scores_list = scores.tolist() + utils.koboldai_vars.lua_koboldbridge.logits = utils.koboldai_vars.lua_state.table() + for r, row in enumerate(scores_list): + utils.koboldai_vars.lua_koboldbridge.logits[ + r + 1 + ] = utils.koboldai_vars.lua_state.table(*row) + utils.koboldai_vars.lua_koboldbridge.vocab_size = scores_shape[-1] + + utils.koboldai_vars.lua_koboldbridge.execute_genmod() + + scores = torch.tensor( + tuple( + tuple(row.values()) + for row in utils.koboldai_vars.lua_koboldbridge.logits.values() + ), + device=scores.device, + dtype=scores.dtype, + ) + assert scores.shape == scores_shape + + return scores + + from torch.nn import functional as F + + def visualize_probabilities(scores: torch.FloatTensor) -> None: + assert scores.ndim == 2 + + if utils.koboldai_vars.numseqs > 1 or not utils.koboldai_vars.show_probs: + return + + if not utils.koboldai_vars.show_probs: + return scores + + option_offset = 0 + if utils.koboldai_vars.actions.action_count + 1 in utils.koboldai_vars.actions.actions: + for x in range( + len( + utils.koboldai_vars.actions.actions[ + utils.koboldai_vars.actions.action_count + 1 + ]["Options"] + ) + ): + option = utils.koboldai_vars.actions.actions[ + utils.koboldai_vars.actions.action_count + 1 + ]["Options"][x] + if option["Pinned"] or option["Previous Selection"] or option["Edited"]: + option_offset = x + 1 + batch_offset = ( + int((utils.koboldai_vars.generated_tkns - 1) / utils.koboldai_vars.genamt) + if utils.koboldai_vars.alt_multi_gen + else 0 + ) + for batch_index, batch in enumerate(scores): + probs = F.softmax(batch, dim=-1).cpu().numpy() + + token_prob_info = [] + for token_id, score in sorted( + enumerate(probs), key=lambda x: x[1], reverse=True + )[:8]: + token_prob_info.append( + { + "tokenId": token_id, + "decoded": utils.decodenewlines(tokenizer.decode(token_id)), + "score": float(score), + } + ) + + if utils.koboldai_vars.numseqs == 1: + utils.koboldai_vars.actions.set_probabilities(token_prob_info) + else: + utils.koboldai_vars.actions.set_option_probabilities( + token_prob_info, batch_index + option_offset + batch_offset + ) + + return scores + + def new_get_logits_processor(*args, **kwargs) -> LogitsProcessorList: + processors = new_get_logits_processor.old_get_logits_processor(*args, **kwargs) + processors.insert(0, LuaLogitsProcessor()) + processors.append(PhraseBiasLogitsProcessor()) + return processors + + use_core_manipulations.get_logits_processor = new_get_logits_processor + new_get_logits_processor.old_get_logits_processor = ( + transformers.GenerationMixin._get_logits_processor + ) + + 
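A quick aside on the phrase-bias machinery above: the suffix/prefix search in _find_intersection is the part that decides which token of a bias phrase gets nudged next. Below is a minimal, self-contained sketch of that same overlap search with a couple of illustrative assertions; the token ids are made up for demonstration, only the algorithm mirrors the processor above.

    from typing import List

    def find_intersection(big: List[int], small: List[int]) -> int:
        """Return the index into `small` of the next token to bias, or 0."""
        num_small = len(small)
        # Only the last len(small) context tokens can possibly overlap the phrase.
        big_slice = list(big[-num_small:])
        start = small[0]
        for i, t in enumerate(big_slice):
            if t == start:
                remaining = len(big_slice) - i
                if big_slice[i:] == small[:remaining]:
                    # Partial overlap: bias the next token of the phrase.
                    if remaining < num_small:
                        return remaining
                    # The whole phrase is already present: start over.
                    return 0
        # No overlap: start biasing from the beginning of the phrase.
        return 0

    # The context already ends with the first two phrase tokens, so the token
    # at index 2 should be biased next.
    assert find_intersection(big=[11, 12, 101, 102], small=[101, 102, 103]) == 2
    # No overlap at all: bias the first token of the phrase.
    assert find_intersection(big=[11, 12, 13, 14], small=[101, 102, 103]) == 0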
class KoboldLogitsWarperList(LogitsProcessorList): + def __init__(self, beams: int = 1, **kwargs): + self.__warper_list: List[LogitsWarper] = [] + self.__warper_list.append( + TopKLogitsWarper(top_k=1, min_tokens_to_keep=1 + (beams > 1)) + ) + self.__warper_list.append( + TopALogitsWarper(top_a=0.5, min_tokens_to_keep=1 + (beams > 1)) + ) + self.__warper_list.append( + TopPLogitsWarper(top_p=0.5, min_tokens_to_keep=1 + (beams > 1)) + ) + self.__warper_list.append( + TailFreeLogitsWarper(tfs=0.5, min_tokens_to_keep=1 + (beams > 1)) + ) + self.__warper_list.append( + TypicalLogitsWarper(typical=0.5, min_tokens_to_keep=1 + (beams > 1)) + ) + self.__warper_list.append(TemperatureLogitsWarper(temperature=0.5)) + self.__warper_list.append(AdvancedRepetitionPenaltyLogitsProcessor()) + + def __call__( + self, + input_ids: torch.LongTensor, + scores: torch.FloatTensor, + *args, + **kwargs, + ): + sampler_order = utils.koboldai_vars.sampler_order[:] + if ( + len(sampler_order) < 7 + ): # Add repetition penalty at beginning if it's not present + sampler_order = [6] + sampler_order + for k in sampler_order: + scores = self.__warper_list[k](input_ids, scores, *args, **kwargs) + visualize_probabilities(scores) + return scores + + def new_get_logits_warper( + beams: int = 1, + ) -> LogitsProcessorList: + return KoboldLogitsWarperList(beams=beams) + + def new_sample(self, *args, **kwargs): + assert kwargs.pop("logits_warper", None) is not None + kwargs["logits_warper"] = new_get_logits_warper( + beams=1, + ) + if (utils.koboldai_vars.newlinemode == "s") or (utils.koboldai_vars.newlinemode == "ns"): + kwargs["eos_token_id"] = -1 + kwargs.setdefault("pad_token_id", 2) + return new_sample.old_sample(self, *args, **kwargs) + + new_sample.old_sample = transformers.GenerationMixin.sample + use_core_manipulations.sample = new_sample + + # Allow bad words filter to ban <|endoftext|> token + import transformers.generation.logits_process + + def new_init(self, bad_words_ids: List[List[int]], eos_token_id: int): + return new_init.old_init(self, bad_words_ids, -1) + + new_init.old_init = ( + transformers.generation.logits_process.NoBadWordsLogitsProcessor.__init__ + ) + transformers.generation.logits_process.NoBadWordsLogitsProcessor.__init__ = new_init + + class TokenStreamer(StoppingCriteria): + # A StoppingCriteria is used here because it seems to run after + # everything has been evaluated score-wise. + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def __call__( + self, + input_ids: torch.LongTensor, + scores: torch.FloatTensor, + **kwargs, + ) -> bool: + if not utils.koboldai_vars.inference_config.do_streaming: + return False + + if not utils.koboldai_vars.output_streaming: + return False + + data = [ + applyoutputformatting( + utils.decodenewlines(tokenizer.decode(x[-1])), + no_sentence_trimming=True, + no_single_line=True, + ) + for x in input_ids + ] + utils.koboldai_vars.actions.stream_tokens(data) + return False + + class ChatModeStopper(StoppingCriteria): + # A StoppingCriteria is used here because it seems to run after + # everything has been evaluated score-wise. 
+ def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def __call__( + self, + input_ids: torch.LongTensor, + scores: torch.FloatTensor, + **kwargs, + ) -> bool: + + if not utils.koboldai_vars.chatmode: + return False + + data = [tokenizer.decode(x) for x in input_ids] + null_character = tokenizer.encode(chr(0))[0] + if "completed" not in self.__dict__: + self.completed = [False] * len(input_ids) + for i in range(len(input_ids)): + if ( + data[i][-1 * (len(utils.koboldai_vars.chatname) + 1) :] + == utils.koboldai_vars.chatname + ":" + ): + self.completed[i] = True + if all(self.completed): + utils.koboldai_vars.generated_tkns = utils.koboldai_vars.genamt + del self.completed + return True + return False + + class CoreStopper(StoppingCriteria): + # Controls core generation stuff; aborting, counting generated tokens, etc + def __init__(self): + self.regeneration_required = False + self.halt = False + + def __call__( + self, + input_ids: torch.LongTensor, + scores: torch.FloatTensor, + **kwargs, + ) -> bool: + if not utils.koboldai_vars.inference_config.do_core: + return False + + utils.koboldai_vars.generated_tkns += 1 + + if ( + not utils.koboldai_vars.standalone + and utils.koboldai_vars.lua_koboldbridge.generated_cols + and utils.koboldai_vars.generated_tkns + != utils.koboldai_vars.lua_koboldbridge.generated_cols + ): + raise RuntimeError( + f"Inconsistency detected between KoboldAI Python and Lua backends ({utils.koboldai_vars.generated_tkns} != {utils.koboldai_vars.lua_koboldbridge.generated_cols})" + ) + + if utils.koboldai_vars.abort or ( + utils.koboldai_vars.inference_config.stop_at_genamt + and utils.koboldai_vars.generated_tkns >= utils.koboldai_vars.genamt + ): + utils.koboldai_vars.abort = False + self.regeneration_required = False + self.halt = False + return True + + if utils.koboldai_vars.standalone: + return False + + assert input_ids.ndim == 2 + + self.regeneration_required = ( + utils.koboldai_vars.lua_koboldbridge.regeneration_required + ) + self.halt = not utils.koboldai_vars.lua_koboldbridge.generating + utils.koboldai_vars.lua_koboldbridge.regeneration_required = False + + for i in ( + range(utils.koboldai_vars.numseqs) + if not utils.koboldai_vars.alt_multi_gen + else range(1) + ): + utils.koboldai_vars.lua_koboldbridge.generated[i + 1][ + utils.koboldai_vars.generated_tkns + ] = int(input_ids[i, -1].item()) + + return self.regeneration_required or self.halt + + # Sets up dynamic world info scanner + class DynamicWorldInfoScanCriteria(StoppingCriteria): + def __init__( + self, + tokenizer, + excluded_world_info: List[Set], + ): + self.tokenizer = tokenizer + self.excluded_world_info = excluded_world_info + + def __call__( + self, + input_ids: torch.LongTensor, + scores: torch.FloatTensor, + **kwargs, + ) -> bool: + + if not utils.koboldai_vars.inference_config.do_dynamic_wi: + return False + + if not utils.koboldai_vars.dynamicscan: + return False + + if len(self.excluded_world_info) != input_ids.shape[0]: + print(tokenizer.decode(self.excluded_world_info)) + print(tokenizer.decode(input_ids.shape[0])) + assert len(self.excluded_world_info) == input_ids.shape[0] + + tail = input_ids[..., -utils.koboldai_vars.generated_tkns :] + for i, t in enumerate(tail): + decoded = utils.decodenewlines(tokenizer.decode(t)) + _, _, _, found = utils.koboldai_vars.calc_ai_text( + submitted_text=decoded, send_context=False + ) + found = list(set(found) - set(self.excluded_world_info[i])) + if found: + print("FOUNDWI", found) + return True + return False + + 
old_get_stopping_criteria = transformers.GenerationMixin._get_stopping_criteria + + def new_get_stopping_criteria(self, *args, **kwargs): + stopping_criteria = old_get_stopping_criteria(self, *args, **kwargs) + + self.core_stopper = CoreStopper() + self.kai_scanner = DynamicWorldInfoScanCriteria( + tokenizer=tokenizer, + excluded_world_info=self.kai_model.gen_config["wi_scanner_excluded_keys"], + ) + token_streamer = TokenStreamer(tokenizer=tokenizer) + + stopping_criteria.insert(0, ChatModeStopper(tokenizer=tokenizer)) + stopping_criteria.insert(0, self.kai_scanner) + token_streamer = TokenStreamer(tokenizer=tokenizer) + stopping_criteria.insert(0, token_streamer) + # This should be last + stopping_criteria.insert(0, self.core_stopper) + + return stopping_criteria + + use_core_manipulations.get_stopping_criteria = new_get_stopping_criteria + + +class GenerationResult: + def __init__( + self, + model: InferenceModel, + out_batches: list, + prompt: list, + # Controls if generate() does it's looping thing. This should only be + # done for HF models that use that StoppingCondition + is_whole_generation: bool, + # Controls if we should trim output by prompt length + output_includes_prompt: bool = False, + # Lazy filter to cut off extra lines where we can't manipulate + # probabilities + single_line: bool = False, + ): + # Shave prompt off of encoded response when needed (HF). Decoded does + # not return prompt. + if output_includes_prompt: + self.encoded = out_batches[:, len(prompt) :] + else: + self.encoded = out_batches + + self.prompt = prompt + self.is_whole_generation = is_whole_generation + + self.decoded = [ + utils.decodenewlines(model.tokenizer.decode(enc)) for enc in self.encoded + ] + + if single_line: + self.decoded = [x.split("\n", 1)[0] for x in self.decoded] + self.encoded = np.array(model.tokenizer(self.decoded).input_ids) + + +@dataclass +class ModelCapabilities: + embedding_manipulation: bool = False + post_token_hooks: bool = False + stopper_hooks: bool = False + # TODO: Support non-live probabilities from APIs + post_token_probs: bool = False + + class InferenceModel: def __init__(self) -> None: - self.gen_config = {} + self.gen_state = {} self.token_gen_hooks = [] + self.stopper_hooks = [] + self.tokenizer = None + self.capabilties = ModelCapabilities() + + def _load(self, save_model: bool) -> None: + raise NotImplementedError + + def _get_tokenizer(self, location: str): + # TODO: This newlinemode inference might need more scrutiny + utils.koboldai_vars.newlinemode = "n" + if "xglm" in location: + # Default to newline mode if using XGLM + utils.koboldai_vars.newlinemode = "s" + if "opt" in location or "bloom" in location: + # Handle but don't convert newlines if using Fairseq models that have newlines trained in them + utils.koboldai_vars.newlinemode = "ns" + + std_kwargs = {"revision": utils.koboldai_vars.revision, "cache_dir": "cache"} + + suppliers = [ + # Fast tokenizer disabled by default as per HF docs: + # > Note: Make sure to pass use_fast=False when loading + # OPT’s tokenizer with AutoTokenizer to get the correct + # tokenizer. 
+ lambda: AutoTokenizer.from_pretrained( + location, use_fast=False, **std_kwargs + ), + lambda: AutoTokenizer.from_pretrained(location, **std_kwargs), + # Fallback to GPT2Tokenizer + lambda: GPT2Tokenizer.from_pretrained(location, **std_kwargs), + lambda: GPT2Tokenizer.from_pretrained("gpt2", **std_kwargs), + ] + + for i, try_get_tokenizer in enumerate(suppliers): + try: + return try_get_tokenizer() + except Exception as e: + # If we error on each attempt, raise the last one + if i == len(suppliers) - 1: + raise e + + def core_generate( + self, + text: list, + _min: int, + _max: int, + found_entries: set, + is_core: bool = False, + ): + # This generation function is tangled with koboldai_vars intentionally. It + # is meant for the story and nothing else. + + start_time = time.time() + gen_in = torch.tensor(text, dtype=torch.long)[None] + logger.debug( + "core_generate: torch.tensor time {}s".format(time.time() - start_time) + ) + + start_time = time.time() + if utils.koboldai_vars.is_model_torch(): + # Torch stuff + if utils.koboldai_vars.full_determinism: + torch.manual_seed(utils.koboldai_vars.seed) + + if utils.koboldai_vars.sp is not None: + assert self.capabilties.embedding_manipulation + soft_tokens = torch.arange( + self.model.config.vocab_size, + self.model.config.vocab_size + utils.koboldai_vars.sp.shape[0], + ) + gen_in = torch.cat((soft_tokens[None], gen_in), dim=-1) + elif utils.koboldai_vars.use_colab_tpu: + if utils.koboldai_vars.full_determinism: + tpu_mtj_backend.set_rng_seed(utils.koboldai_vars.seed) + + logger.debug( + "core_generate: Model Setup (SP, etc) time {}s".format( + time.time() - start_time + ) + ) + + if ( + gen_in.shape[-1] + utils.koboldai_vars.genamt + > utils.koboldai_vars.max_length + ): + logger.error("gen_in.shape[-1]: {}".format(gen_in.shape[-1])) + logger.error( + "utils.koboldai_vars.genamt: {}".format(utils.koboldai_vars.genamt) + ) + logger.error( + "utils.koboldai_vars.max_length: {}".format( + utils.koboldai_vars.max_length + ) + ) + assert ( + gen_in.shape[-1] + utils.koboldai_vars.genamt + <= utils.koboldai_vars.max_length + ) + + start_time = time.time() + gen_in = gen_in.to(utils.get_auxilary_device()) + + logger.debug( + "core_generate: gen_in to device time {}s".format(time.time() - start_time) + ) + start_time = time.time() + + found_entries = found_entries or set() + + self.gen_state["wi_scanner_excluded_keys"] = found_entries + + utils.koboldai_vars._prompt = utils.koboldai_vars.prompt + + with torch.no_grad(): + already_generated = 0 + numseqs = utils.koboldai_vars.numseqs + total_gens = None + + for i in range( + utils.koboldai_vars.numseqs if utils.koboldai_vars.alt_multi_gen else 1 + ): + while True: + # The reason this is a loop is due to how Dynamic WI works. We + # cannot simply add the WI to the context mid-generation, so we + # stop early, and then insert WI, then continue generating. That + # stopping and continuing is this loop. + + start_time = time.time() + result = self.raw_generate( + gen_in[0], + max_new=utils.koboldai_vars.genamt, + do_streaming=utils.koboldai_vars.output_streaming, + do_dynamic_wi=utils.koboldai_vars.dynamicscan, + batch_count=numseqs + if not utils.koboldai_vars.alt_multi_gen + else 1, + # Real max length is handled by CoreStopper. 
+ bypass_hf_maxlength=utils.koboldai_vars.dynamicscan, + is_core=True, + ) + logger.debug( + "core_generate: run raw_generate pass {} {}s".format( + already_generated, time.time() - start_time + ) + ) + + genout = result.encoded + + already_generated += len(genout[0]) + + try: + assert ( + already_generated + <= utils.koboldai_vars.genamt * utils.koboldai_vars.numseqs + if utils.koboldai_vars.alt_multi_gen + else 1 + ) + except AssertionError: + print("AlreadyGenerated", already_generated) + print("genamt", utils.koboldai_vars.genamt) + raise + + if result.is_whole_generation: + break + + # Generation stopped; why? + # If we have been told to halt, we have reached our target token + # amount (controlled by halt), or Dynamic WI has not told us to + # stop temporarily to insert WI, we can assume that we are done + # generating. We shall break. + if ( + model.core_stopper.halt + or not model.core_stopper.regeneration_required + ): + break + + # Now we are doing stuff for Dynamic WI. + assert genout.ndim >= 2 + assert genout.shape[0] == utils.koboldai_vars.numseqs + + if ( + utils.koboldai_vars.lua_koboldbridge.generated_cols + and utils.koboldai_vars.generated_tkns + != utils.koboldai_vars.lua_koboldbridge.generated_cols + ): + raise RuntimeError( + f"Inconsistency detected between KoboldAI Python and Lua backends ({utils.koboldai_vars.generated_tkns} != {utils.koboldai_vars.lua_koboldbridge.generated_cols})" + ) + + if already_generated != utils.koboldai_vars.generated_tkns: + print("already_generated: {}".format(already_generated)) + print( + "generated_tkns: {}".format( + utils.koboldai_vars.generated_tkns + ) + ) + raise RuntimeError("WI scanning error") + + for r in range(utils.koboldai_vars.numseqs): + for c in range(already_generated): + assert ( + utils.koboldai_vars.lua_koboldbridge.generated[r + 1][ + c + 1 + ] + is not None + ) + genout[r][ + genout.shape[-1] - already_generated + c + ] = utils.koboldai_vars.lua_koboldbridge.generated[r + 1][ + c + 1 + ] + + encoded = [] + + for i in range(utils.koboldai_vars.numseqs): + txt = utils.decodenewlines( + self.tokenizer.decode(genout[i, -already_generated:]) + ) + # winfo, mem, anotetxt, _found_entries = calcsubmitbudgetheader(txt, force_use_txt=True, actions=utils.koboldai_vars.actions) + # txt, _, _ = calcsubmitbudget(len(utils.koboldai_vars.actions), winfo, mem, anotetxt, utils.koboldai_vars.actions, submission=txt) + txt, _, _, _found_entries = utils.koboldai_vars.calc_ai_text( + submitted_text=txt, send_context=False + ) + found_entries[i].update(_found_entries) + encoded.append( + torch.tensor(txt, dtype=torch.long, device=genout.device) + ) + + max_length = len(max(encoded, key=len)) + encoded = torch.stack( + tuple( + torch.nn.functional.pad( + e, + (max_length - len(e), 0), + value=self.model.config.pad_token_id + or self.model.config.eos_token_id, + ) + for e in encoded + ) + ) + genout = torch.cat( + ( + encoded, + genout[..., -already_generated:], + ), + dim=-1, + ) + + if utils.koboldai_vars.sp is not None: + soft_tokens = torch.arange( + self.model.config.vocab_size, + self.model.config.vocab_size + + utils.koboldai_vars.sp.shape[0], + device=genout.device, + ) + genout = torch.cat( + (soft_tokens.tile(utils.koboldai_vars.numseqs, 1), genout), + dim=-1, + ) + + assert ( + genout.shape[-1] + + utils.koboldai_vars.genamt + - already_generated + <= utils.koboldai_vars.max_length + ) + gen_in = genout + numseqs = 1 + if total_gens is None: + total_gens = genout + else: + total_gens = torch.cat((total_gens, genout)) + + return 
total_gens, already_generated + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + ) -> GenerationResult: + raise NotImplementedError + + def raw_generate( + self, + # prompt is either a string (text) or a list (token ids) + prompt: Union[str, list, np.ndarray], + max_new: int, + do_streaming: bool = False, + do_dynamic_wi: bool = False, + batch_count: int = 1, + bypass_hf_maxlength: bool = False, + generation_settings: Optional[dict] = None, + is_core: bool = False, + single_line: bool = False, + found_entries: set = (), + ) -> GenerationResult: + """A wrapper around _raw_generate() that handles timing and some other minute stuff.""" + # TODO: Support singleline outside of torch + + self.gen_state["do_streaming"] = do_streaming + self.gen_state["do_dynamic_wi"] = do_dynamic_wi + + # Dynamic WI depends on this!!! This is a main gen call. + self.gen_state["stop_at_genamt"] = do_dynamic_wi + + # Makes stopping criteria hook happy + self.gen_state["wi_scanner_excluded_keys"] = self.gen_state.get( + "wi_scanner_excluded_keys", set() + ) + + utils.koboldai_vars.inference_config.do_core = is_core + gen_settings = GenerationSettings(*(generation_settings or {})) + + if isinstance(prompt, torch.Tensor): + prompt_tokens = prompt.cpu().numpy() + elif isinstance(prompt, list): + prompt_tokens = np.array(prompt) + elif isinstance(prompt, str): + prompt_tokens = np.array(self.tokenizer.encode(prompt)) + else: + raise ValueError(f"Prompt is {type(prompt)}. Not a fan!") + + assert isinstance(prompt_tokens, np.ndarray) + assert len(prompt_tokens.shape) == 1 + + if utils.koboldai_vars.model == "ReadOnly": + raise NotImplementedError("No loaded model") + + result: GenerationResult + time_start = time.time() + + with use_core_manipulations(): + self._raw_generate( + prompt_tokens=prompt_tokens, + max_new=max_new, + batch_count=batch_count, + gen_settings=gen_settings, + single_line=single_line, + ) + # if i_vars.use_colab_tpu or koboldai_vars.model in ( + # "TPUMeshTransformerGPTJ", + # "TPUMeshTransformerGPTNeoX", + # ): + time_end = round(time.time() - time_start, 2) + tokens_per_second = round(len(result.encoded[0]) / time_end, 2) + + if not utils.koboldai_vars.quiet: + logger.info( + f"Generated {len(result.encoded[0])} tokens in {time_end} seconds, for an average rate of {tokens_per_second} tokens per second." 
+ ) + + return result def generate( self, @@ -54,7 +1201,106 @@ class InferenceModel: hook(input_ids) -class HFTorchInferenceModel: +class HFMTJInferenceModel: + def __init__( + self, + model_name: str, + ) -> None: + super().__init__() + + self.model_name = model_name + + self.model = None + self.tokenizer = None + self.model_config = None + self.capabilties = ModelCapabilities( + embedding_manipulation=False, + post_token_hooks=False, + stopper_hooks=False, + post_token_probs=False, + ) + + def get_soft_tokens() -> np.array: + soft_tokens = None + + if utils.koboldai_vars.sp is None: + tensor = np.zeros( + ( + 1, + tpu_mtj_backend.params.get( + "d_embed", tpu_mtj_backend.params["d_model"] + ), + ), + dtype=np.float32, + ) + rows = tensor.shape[0] + padding_amount = ( + tpu_mtj_backend.params["seq"] + - ( + tpu_mtj_backend.params["seq"] + % -tpu_mtj_backend.params["cores_per_replica"] + ) + - rows + ) + tensor = np.pad(tensor, ((0, padding_amount), (0, 0))) + tensor = tensor.reshape( + tpu_mtj_backend.params["cores_per_replica"], + -1, + tpu_mtj_backend.params.get( + "d_embed", tpu_mtj_backend.params["d_model"] + ), + ) + utils.koboldai_vars.sp = tpu_mtj_backend.shard_xmap(tensor) + + soft_tokens = np.arange( + tpu_mtj_backend.params["n_vocab"] + + tpu_mtj_backend.params["n_vocab_padding"], + tpu_mtj_backend.params["n_vocab"] + + tpu_mtj_backend.params["n_vocab_padding"] + + utils.koboldai_vars.sp_length, + dtype=np.uint32, + ) + return soft_tokens + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + ): + soft_tokens = self.get_soft_tokens() + + genout = tpool.execute( + tpu_mtj_backend.infer_static, + np.uint32(prompt_tokens), + gen_len=max_new, + temp=gen_settings.temp, + top_p=gen_settings.top_p, + top_k=gen_settings.top_k, + tfs=gen_settings.tfs, + typical=gen_settings.typical, + top_a=gen_settings.top_a, + numseqs=batch_count, + repetition_penalty=gen_settings.rep_pen, + rpslope=gen_settings.rep_pen_slope, + rprange=gen_settings.rep_pen_range, + soft_embeddings=utils.koboldai_vars.sp, + soft_tokens=soft_tokens, + sampler_order=gen_settings.sampler_order, + ) + genout = np.array(genout) + + return GenerationResult( + out_batches=genout, + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) + + +class HFTorchInferenceModel(InferenceModel): def __init__( self, model_name: str, @@ -70,70 +1316,73 @@ class HFTorchInferenceModel: self.model = None self.tokenizer = None self.model_config = None + self.capabilties = ModelCapabilities( + embedding_manipulation=True, + post_token_hooks=True, + stopper_hooks=True, + post_token_probs=True, + ) - def generate( + def _raw_generate( self, prompt_tokens: Union[List[int], torch.Tensor], - max_new_tokens: int, - do_streaming: bool = False, - do_dynamic_wi: bool = False, + max_new: int, + gen_settings: GenerationSettings, single_line: bool = False, batch_count: int = 1, - ) -> torch.Tensor: - raise NotImplementedError("AHHHH") + ): + if not isinstance(prompt_tokens, torch.Tensor): + gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] + else: + gen_in = prompt_tokens + + device = utils.get_auxilary_device() + gen_in = gen_in.to(device) + + additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else [] + + with torch.no_grad(): + start_time = time.time() + genout = self.model.generate( + gen_in, + do_sample=True, + max_length=min( + len(prompt_tokens) + max_new, 
utils.koboldai_vars.max_length + ), + repetition_penalty=1.0, + bad_words_ids=utils.koboldai_vars.badwordsids + + additional_bad_words_ids, + use_cache=True, + num_return_sequences=batch_count, + ) + logger.debug( + "torch_raw_generate: run generator {}s".format(time.time() - start_time) + ) + + return genout - self.gen_config = { - "do_streaming": do_streaming, - "do_dynamic_wi": do_dynamic_wi, - "stop_at_genamt": do_dynamic_wi, - } - def _get_model(self, location: str, tf_kwargs: Dict): try: return AutoModelForCausalLM.from_pretrained( location, revision=utils.koboldai_vars.revision, cache_dir="cache", - **tf_kwargs + **tf_kwargs, ) except Exception as e: if "out of memory" in traceback.format_exc().lower(): - raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") + raise RuntimeError( + "One of your GPUs ran out of memory when KoboldAI tried to load your model." + ) return GPTNeoForCausalLM.from_pretrained( location, revision=utils.koboldai_vars.revision, cache_dir="cache", - **tf_kwargs + **tf_kwargs, ) - - def _get_tokenizer(self, location: str): - std_kwargs = {"revision": utils.koboldai_vars.revision, "cache_dir": "cache"} - suppliers = [ - # Fast tokenizer disabled by default as per HF docs: - # > Note: Make sure to pass use_fast=False when loading - # OPT’s tokenizer with AutoTokenizer to get the correct - # tokenizer. - lambda: AutoTokenizer.from_pretrained(location, use_fast=False, **std_kwargs), - lambda: AutoTokenizer.from_pretrained(location, **std_kwargs), - - # Fallback to GPT2Tokenizer - lambda: GPT2Tokenizer.from_pretrained(location, **std_kwargs), - lambda: GPT2Tokenizer.from_pretrained("gpt2", **std_kwargs), - ] - - for i, try_get_tokenizer in enumerate(suppliers): - try: - return try_get_tokenizer() - except Exception as e: - # If we error on each attempt, raise the last one - if i == len(suppliers) - 1: - raise e - def get_local_model_path( - self, - legacy: bool = False, - ignore_existance: bool = False + self, legacy: bool = False, ignore_existance: bool = False ) -> Optional[str]: """ Returns a string of the model's path locally, or None if it is not downloaded. 
@@ -145,28 +1394,30 @@ class HFTorchInferenceModel: ret = basename else: ret = os.path.join("models", basename) - + if os.path.isdir(ret) or ignore_existance: return ret return None - def get_hidden_size(self) -> int: return self.model.get_input_embeddings().embedding_dim - def _move_to_devices(self) -> None: if not utils.koboldai_vars.breakmodel: if utils.koboldai_vars.usegpu: self.model = self.model.half().to(utils.koboldai_vars.gpu_device) else: - self.model = self.model.to('cpu').float() + self.model = self.model.to("cpu").float() return for key, value in self.model.state_dict().items(): - target_dtype = torch.float32 if breakmodel.primary_device == "cpu" else torch.float16 + target_dtype = ( + torch.float32 if breakmodel.primary_device == "cpu" else torch.float16 + ) if value.dtype is not target_dtype: - accelerate.utils.set_module_tensor_to_device(self.model, key, target_dtype) + accelerate.utils.set_module_tensor_to_device( + self.model, key, target_dtype + ) disk_blocks = breakmodel.disk_blocks gpu_blocks = breakmodel.gpu_blocks @@ -176,7 +1427,11 @@ class HFTorchInferenceModel: for name in utils.layers_module_names: layer = int(name.rsplit(".", 1)[1]) - device = ("disk" if layer < disk_blocks else "cpu") if layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) + device = ( + ("disk" if layer < disk_blocks else "cpu") + if layer < ram_blocks + else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) + ) device_map[name] = device for name in utils.get_missing_module_names(self.model, list(device_map.keys())): @@ -187,7 +1442,7 @@ class HFTorchInferenceModel: device_map, main_device=breakmodel.primary_device, offload_buffers=True, - offload_dir="accelerate-disk-cache" + offload_dir="accelerate-disk-cache", ) gc.collect() @@ -231,7 +1486,7 @@ class HFTorchInferenceModel: # breakmodel.move_hidden_layers(model.model, model.model.layers) # else: # breakmodel.move_hidden_layers(model.model.decoder, model.model.decoder.layers) - + # Function to patch transformers to use our soft prompt def patch_embedding(self) -> None: if getattr(Embedding, "_koboldai_patch_causallm_model", None): @@ -241,9 +1496,13 @@ class HFTorchInferenceModel: old_embedding_call = Embedding.__call__ kai_model = self + def new_embedding_call(self, input_ids, *args, **kwargs): # Don't touch embeddings for models other than the core inference model (that's us!) 
- if Embedding._koboldai_patch_causallm_model.get_input_embeddings() is not self: + if ( + Embedding._koboldai_patch_causallm_model.get_input_embeddings() + is not self + ): return old_embedding_call(self, input_ids, *args, **kwargs) assert input_ids is not None @@ -255,7 +1514,9 @@ class HFTorchInferenceModel: inputs_embeds = old_embedding_call(self, input_ids, *args, **kwargs) if utils.koboldai_vars.sp is not None: - utils.koboldai_vars.sp = utils.koboldai_vars.sp.to(inputs_embeds.dtype).to(inputs_embeds.device) + utils.koboldai_vars.sp = utils.koboldai_vars.sp.to( + inputs_embeds.dtype + ).to(inputs_embeds.device) inputs_embeds = torch.where( (shifted_input_ids >= 0)[..., None], utils.koboldai_vars.sp[shifted_input_ids.clamp(min=0)], @@ -267,7 +1528,6 @@ class HFTorchInferenceModel: Embedding.__call__ = new_embedding_call Embedding._koboldai_patch_causallm_model = self.model - def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True): if not self.lazy_load: return @@ -311,7 +1571,8 @@ class HFTorchInferenceModel: utils.koboldai_vars.gpu_device if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu else "cpu" - if not utils.koboldai_vars.hascuda or not utils.koboldai_vars.breakmodel + if not utils.koboldai_vars.hascuda + or not utils.koboldai_vars.breakmodel else breakmodel.primary_device ) else: @@ -331,7 +1592,8 @@ class HFTorchInferenceModel: else "disk" if layer < disk_blocks and layer < ram_blocks else "cpu" - if not utils.koboldai_vars.hascuda or not utils.koboldai_vars.breakmodel + if not utils.koboldai_vars.hascuda + or not utils.koboldai_vars.breakmodel else "shared" if layer < ram_blocks else bisect.bisect_right( @@ -431,7 +1693,10 @@ class HFTorchInferenceModel: convert_to_float16 and breakmodel.primary_device != "cpu" and utils.koboldai_vars.hascuda - and (utils.koboldai_vars.breakmodel or utils.koboldai_vars.usegpu) + and ( + utils.koboldai_vars.breakmodel + or utils.koboldai_vars.usegpu + ) and model_dict[key].dtype is torch.float32 ): model_dict[key] = model_dict[key].to(torch.float16) @@ -475,7 +1740,8 @@ class HFTorchInferenceModel: and breakmodel.primary_device != "cpu" and utils.koboldai_vars.hascuda and ( - utils.koboldai_vars.breakmodel or utils.koboldai_vars.usegpu + utils.koboldai_vars.breakmodel + or utils.koboldai_vars.usegpu ) ): dtype = torch.float16 @@ -513,7 +1779,11 @@ class HFTorchInferenceModel: @contextlib.contextmanager def _maybe_use_float16(self, always_use: bool = False): - if always_use or (utils.koboldai_vars.hascuda and self.low_mem and (utils.koboldai_vars.usegpu or utils.koboldai_vars.breakmodel)): + if always_use or ( + utils.koboldai_vars.hascuda + and self.low_mem + and (utils.koboldai_vars.usegpu or utils.koboldai_vars.breakmodel) + ): original_dtype = torch.get_default_dtype() torch.set_default_dtype(torch.float16) yield True @@ -526,157 +1796,215 @@ class HFTorchInferenceModel: # HACK: Tttttttterrrible structure_hack class colors: - PURPLE = '\033[95m' - BLUE = '\033[94m' - CYAN = '\033[96m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - RED = '\033[91m' - END = '\033[0m' - UNDERLINE = '\033[4m' + PURPLE = "\033[95m" + BLUE = "\033[94m" + CYAN = "\033[96m" + GREEN = "\033[92m" + YELLOW = "\033[93m" + RED = "\033[91m" + END = "\033[0m" + UNDERLINE = "\033[4m" device_count = torch.cuda.device_count() - if(device_count < 2): + if device_count < 2: primary = None - gpu_blocks = breakmodel.gpu_blocks + (device_count - len(breakmodel.gpu_blocks))*[0] + gpu_blocks = breakmodel.gpu_blocks + ( + device_count - 
len(breakmodel.gpu_blocks) + ) * [0] print(f"{colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{colors.END}") for i in range(device_count): name = torch.cuda.get_device_name(i) - if(len(name) > 47): + if len(name) > 47: name = "..." + name[-44:] row_color = colors.END sep_color = colors.YELLOW - print(f"{row_color}{colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{colors.END}") + print( + f"{row_color}{colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{colors.END}" + ) row_color = colors.END sep_color = colors.YELLOW - if(utils.HAS_ACCELERATE): - print(f"{row_color}{colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){colors.END}") - print(f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){colors.END}") + if utils.HAS_ACCELERATE: + print( + f"{row_color}{colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){colors.END}" + ) + print( + f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){colors.END}" + ) def breakmodel_device_config(self, config): # TODO: Find a better place for this or rework this # HACK: Tttttttterrrible structure_hack class colors: - PURPLE = '\033[95m' - BLUE = '\033[94m' - CYAN = '\033[96m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - RED = '\033[91m' - END = '\033[0m' - UNDERLINE = '\033[4m' + PURPLE = "\033[95m" + BLUE = "\033[94m" + CYAN = "\033[96m" + GREEN = "\033[92m" + YELLOW = "\033[93m" + RED = "\033[91m" + END = "\033[0m" + UNDERLINE = "\033[4m" global breakmodel, generator import breakmodel + n_layers = utils.num_layers(config) if utils.args.cpu: - breakmodel.gpu_blocks = [0]*n_layers + breakmodel.gpu_blocks = [0] * n_layers return - elif(utils.args.breakmodel_gpulayers is not None or (utils.HAS_ACCELERATE and utils.args.breakmodel_disklayers is not None)): + elif utils.args.breakmodel_gpulayers is not None or ( + utils.HAS_ACCELERATE and utils.args.breakmodel_disklayers is not None + ): try: - if(not utils.args.breakmodel_gpulayers): + if not utils.args.breakmodel_gpulayers: breakmodel.gpu_blocks = [] else: - breakmodel.gpu_blocks = list(map(int, utils.args.breakmodel_gpulayers.split(','))) + breakmodel.gpu_blocks = list( + map(int, utils.args.breakmodel_gpulayers.split(",")) + ) assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count() s = n_layers for i in range(len(breakmodel.gpu_blocks)): - if(breakmodel.gpu_blocks[i] <= -1): + if breakmodel.gpu_blocks[i] <= -1: breakmodel.gpu_blocks[i] = s break else: s -= breakmodel.gpu_blocks[i] assert sum(breakmodel.gpu_blocks) <= n_layers n_layers -= sum(breakmodel.gpu_blocks) - if(utils.args.breakmodel_disklayers is not None): + if utils.args.breakmodel_disklayers is not None: assert utils.args.breakmodel_disklayers <= n_layers breakmodel.disk_blocks = utils.args.breakmodel_disklayers n_layers -= utils.args.breakmodel_disklayers except: - logger.warning("--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. 
Defaulting to all layers on device 0.") + logger.warning( + "--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0." + ) breakmodel.gpu_blocks = [n_layers] n_layers = 0 - elif(utils.args.breakmodel_layers is not None): - breakmodel.gpu_blocks = [n_layers - max(0, min(n_layers, utils.args.breakmodel_layers))] + elif utils.args.breakmodel_layers is not None: + breakmodel.gpu_blocks = [ + n_layers - max(0, min(n_layers, utils.args.breakmodel_layers)) + ] n_layers -= sum(breakmodel.gpu_blocks) - elif(utils.args.model is not None): + elif utils.args.model is not None: logger.info("Breakmodel not specified, assuming GPU 0") breakmodel.gpu_blocks = [n_layers] n_layers = 0 else: device_count = torch.cuda.device_count() - if(device_count > 1): - print(colors.CYAN + "\nPlease select one of your GPUs to be your primary GPU.") - print("VRAM usage in your primary GPU will be higher than for your other ones.") + if device_count > 1: + print( + colors.CYAN + + "\nPlease select one of your GPUs to be your primary GPU." + ) + print( + "VRAM usage in your primary GPU will be higher than for your other ones." + ) print("It is recommended you make your fastest GPU your primary GPU.") self.breakmodel_device_list(n_layers) - while(True): + while True: primaryselect = input("device ID> ") - if(primaryselect.isnumeric() and 0 <= int(primaryselect) < device_count): + if ( + primaryselect.isnumeric() + and 0 <= int(primaryselect) < device_count + ): breakmodel.primary_device = int(primaryselect) break else: - print(f"{colors.RED}Please enter an integer between 0 and {device_count-1}.{colors.END}") + print( + f"{colors.RED}Please enter an integer between 0 and {device_count-1}.{colors.END}" + ) else: breakmodel.primary_device = 0 - print(colors.PURPLE + "\nIf you don't have enough VRAM to run the model on a single GPU") - print("you can split the model between your CPU and your GPU(s), or between") + print( + colors.PURPLE + + "\nIf you don't have enough VRAM to run the model on a single GPU" + ) + print( + "you can split the model between your CPU and your GPU(s), or between" + ) print("multiple GPUs if you have more than one.") print("By putting more 'layers' on a GPU or CPU, more computations will be") - print("done on that device and more VRAM or RAM will be required on that device") + print( + "done on that device and more VRAM or RAM will be required on that device" + ) print("(roughly proportional to number of layers).") - print("It should be noted that GPUs are orders of magnitude faster than the CPU.") - print(f"This model has{colors.YELLOW} {n_layers} {colors.PURPLE}layers.{colors.END}\n") + print( + "It should be noted that GPUs are orders of magnitude faster than the CPU." 
+ ) + print( + f"This model has{colors.YELLOW} {n_layers} {colors.PURPLE}layers.{colors.END}\n" + ) for i in range(device_count): - self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device, selected=i) - print(f"{colors.CYAN}\nHow many of the remaining{colors.YELLOW} {n_layers} {colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{colors.END}\n") - while(True): + self.breakmodel_device_list( + n_layers, primary=breakmodel.primary_device, selected=i + ) + print( + f"{colors.CYAN}\nHow many of the remaining{colors.YELLOW} {n_layers} {colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{colors.END}\n" + ) + while True: layerselect = input("# of layers> ") - if((layerselect.isnumeric() or layerselect.strip() == '-1') and -1 <= int(layerselect) <= n_layers): + if ( + layerselect.isnumeric() or layerselect.strip() == "-1" + ) and -1 <= int(layerselect) <= n_layers: layerselect = int(layerselect) layerselect = n_layers if layerselect == -1 else layerselect breakmodel.gpu_blocks.append(layerselect) n_layers -= layerselect break else: - print(f"{colors.RED}Please enter an integer between -1 and {n_layers}.{colors.END}") - if(n_layers == 0): + print( + f"{colors.RED}Please enter an integer between -1 and {n_layers}.{colors.END}" + ) + if n_layers == 0: break - if(utils.HAS_ACCELERATE and n_layers > 0): - self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device, selected=-1) - print(f"{colors.CYAN}\nHow many of the remaining{colors.YELLOW} {n_layers} {colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{colors.END}\n") - while(True): + if utils.HAS_ACCELERATE and n_layers > 0: + self.breakmodel_device_list( + n_layers, primary=breakmodel.primary_device, selected=-1 + ) + print( + f"{colors.CYAN}\nHow many of the remaining{colors.YELLOW} {n_layers} {colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{colors.END}\n" + ) + while True: layerselect = input("# of layers> ") - if((layerselect.isnumeric() or layerselect.strip() == '-1') and -1 <= int(layerselect) <= n_layers): + if ( + layerselect.isnumeric() or layerselect.strip() == "-1" + ) and -1 <= int(layerselect) <= n_layers: layerselect = int(layerselect) layerselect = n_layers if layerselect == -1 else layerselect breakmodel.disk_blocks = layerselect n_layers -= layerselect break else: - print(f"{colors.RED}Please enter an integer between -1 and {n_layers}.{colors.END}") + print( + f"{colors.RED}Please enter an integer between -1 and {n_layers}.{colors.END}" + ) logger.init_ok("Final device configuration:", status="Info") self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device) # If all layers are on the same device, use the old GPU generation mode - while(len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0): + while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0: breakmodel.gpu_blocks.pop() - if(len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (-1, utils.num_layers(config))): + if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in ( + -1, + utils.num_layers(config), + ): utils.koboldai_vars.breakmodel = False utils.koboldai_vars.usegpu = True - utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks)-1 + utils.koboldai_vars.gpu_device = 
len(breakmodel.gpu_blocks) - 1 return - if(not breakmodel.gpu_blocks): + if not breakmodel.gpu_blocks: logger.warning("Nothing assigned to a GPU, reverting to CPU only mode") import breakmodel + breakmodel.primary_device = "cpu" utils.koboldai_vars.breakmodel = False utils.koboldai_vars.usegpu = False @@ -695,19 +2023,25 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): # utils.koboldai_vars.custmodpth = utils.koboldai_vars.model if utils.koboldai_vars.model == "NeoCustom": - utils.koboldai_vars.model = os.path.basename(os.path.normpath(utils.koboldai_vars.custmodpth)) + utils.koboldai_vars.model = os.path.basename( + os.path.normpath(utils.koboldai_vars.custmodpth) + ) # If we specify a model and it's in the root directory, we need to move # it to the models directory (legacy folder structure to new) if self.get_local_model_path(legacy=True): shutil.move( self.get_local_model_path(legacy=True, ignore_existance=True), - self.get_local_model_path(ignore_existance=True) + self.get_local_model_path(ignore_existance=True), ) - + # Get the model_type from the config or assume a model type if it isn't present try: - model_config = AutoConfig.from_pretrained(self.get_local_model_path() or utils.koboldai_vars.model, revision=utils.koboldai_vars.revision, cache_dir="cache") + model_config = AutoConfig.from_pretrained( + self.get_local_model_path() or utils.koboldai_vars.model, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + ) utils.koboldai_vars.model_type = model_config.model_type except ValueError as e: utils.koboldai_vars.model_type = { @@ -716,10 +2050,11 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): }.get(utils.koboldai_vars.model) if not utils.koboldai_vars.model_type: - logger.warning("No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)") + logger.warning( + "No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)" + ) utils.koboldai_vars.model_type = "gpt_neo" - tf_kwargs = { "low_cpu_mem_usage": True, } @@ -731,15 +2066,22 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): # Also, lazy loader doesn't support GPT-2 models utils.koboldai_vars.lazy_load = False - + # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors - if utils.koboldai_vars.lazy_load and utils.koboldai_vars.hascuda and utils.koboldai_vars.breakmodel and not utils.koboldai_vars.nobreakmodel: + if ( + utils.koboldai_vars.lazy_load + and utils.koboldai_vars.hascuda + and utils.koboldai_vars.breakmodel + and not utils.koboldai_vars.nobreakmodel + ): self.breakmodel_device_config(model_config) if utils.koboldai_vars.lazy_load: # If we're using lazy loader, we need to figure out what the model's hidden layers are called - with torch_lazy_loader.use_lazy_torch_load(dematerialized_modules=True, use_accelerate_init_empty_weights=True): + with torch_lazy_loader.use_lazy_torch_load( + dematerialized_modules=True, use_accelerate_init_empty_weights=True + ): try: metamodel = AutoModelForCausalLM.from_config(model_config) except Exception as e: @@ -751,8 +2093,10 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): # Download model from Huggingface if it does not exist, otherwise load locally with self._maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load( enable=utils.koboldai_vars.lazy_load, - callback=self._get_lazy_load_callback(utils.num_layers(model_config)) if 
utils.koboldai_vars.lazy_load else None, - dematerialized_modules=True + callback=self._get_lazy_load_callback(utils.num_layers(model_config)) + if utils.koboldai_vars.lazy_load + else None, + dematerialized_modules=True, ): if utils.koboldai_vars.lazy_load: # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time @@ -768,11 +2112,12 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): # _rebuild_tensor patch for casting dtype and supporting LazyTensors old_rebuild_tensor = torch._utils._rebuild_tensor + def new_rebuild_tensor( storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, - stride + stride, ): if not isinstance(storage, torch_lazy_loader.LazyTensor): dtype = storage.dtype @@ -789,14 +2134,19 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): torch._utils._rebuild_tensor = old_rebuild_tensor if save_model: - self.tokenizer.save_pretrained(self.get_local_model_path(ignore_existance=True)) + self.tokenizer.save_pretrained( + self.get_local_model_path(ignore_existance=True) + ) if utils.koboldai_vars.fp32_model and not breakmodel.disk_blocks: # Use save_pretrained to convert fp32 models to fp16, # unless we are using disk cache because save_pretrained # is not supported in that case model = model.half() - model.save_pretrained(self.get_local_model_path(ignore_existance=True), max_shard_size="500MiB") + model.save_pretrained( + self.get_local_model_path(ignore_existance=True), + max_shard_size="500MiB", + ) else: # For fp16 models, we can just copy the model files directly @@ -805,40 +2155,51 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): import transformers.file_utils import huggingface_hub - legacy = packaging.version.parse(transformers_version) < packaging.version.parse("4.22.0.dev0") + legacy = packaging.version.parse( + transformers_version + ) < packaging.version.parse("4.22.0.dev0") # Save the config.json shutil.move( - os.path.realpath(huggingface_hub.hf_hub_download( - utils.koboldai_vars.model, - transformers.configuration_utils.CONFIG_NAME, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=legacy - )), + os.path.realpath( + huggingface_hub.hf_hub_download( + utils.koboldai_vars.model, + transformers.configuration_utils.CONFIG_NAME, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=legacy, + ) + ), os.path.join( self.get_local_model_path(ignore_existance=True), - transformers.configuration_utils.CONFIG_NAME - ) + transformers.configuration_utils.CONFIG_NAME, + ), ) if utils.num_shards is None: # Save the pytorch_model.bin or model.safetensors of an unsharded model - for possible_weight_name in [transformers.modeling_utils.WEIGHTS_NAME, "model.safetensors"]: + for possible_weight_name in [ + transformers.modeling_utils.WEIGHTS_NAME, + "model.safetensors", + ]: try: shutil.move( - os.path.realpath(huggingface_hub.hf_hub_download( - utils.koboldai_vars.model, - possible_weight_name, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=legacy - )), + os.path.realpath( + huggingface_hub.hf_hub_download( + utils.koboldai_vars.model, + possible_weight_name, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=legacy, + ) + ), os.path.join( - self.get_local_model_path(ignore_existance=True), + self.get_local_model_path( + ignore_existance=True + ), possible_weight_name, - ) + 
), ) except Exception as e: if possible_weight_name == "model.safetensors": @@ -854,29 +2215,41 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): os.path.realpath(utils.from_pretrained_index_filename), os.path.join( self.get_local_model_path(ignore_existance=True), - transformers.modeling_utils.WEIGHTS_INDEX_NAME - ) + transformers.modeling_utils.WEIGHTS_INDEX_NAME, + ), ) # Then save the pytorch_model-#####-of-#####.bin files for filename in filenames: shutil.move( - os.path.realpath(huggingface_hub.hf_hub_download( - utils.koboldai_vars.model, - filename, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=legacy - )), + os.path.realpath( + huggingface_hub.hf_hub_download( + utils.koboldai_vars.model, + filename, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=legacy, + ) + ), os.path.join( - self.get_local_model_path(ignore_existance=True), - filename - ) + self.get_local_model_path( + ignore_existance=True + ), + filename, + ), ) shutil.rmtree("cache/") - if utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj"): - utils.koboldai_vars.badwordsids = [[v] for k, v in self.tokenizer.get_vocab().items() if any(c in str(k) for c in "<>[]") if utils.koboldai_vars.newlinemode != "s" or str(k) != ""] + if ( + utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default + and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj") + ): + utils.koboldai_vars.badwordsids = [ + [v] + for k, v in self.tokenizer.get_vocab().items() + if any(c in str(k) for c in "<>[]") + if utils.koboldai_vars.newlinemode != "s" or str(k) != "" + ] self.patch_embedding() @@ -896,11 +2269,12 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel): self._move_to_devices() else: # Use CPU - self.model = self.model.to('cpu').float() + self.model = self.model.to("cpu").float() elif breakmodel.disk_blocks > 0: self._move_to_devices() else: - self.model = self.model.to('cpu').float() + self.model = self.model.to("cpu").float() + self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() @@ -912,32 +2286,49 @@ class CustomGPT2HFTorchInferenceModel(HFTorchInferenceModel): for possible_config_path in [ utils.koboldai_vars.custmodpth, - os.path.join("models", utils.koboldai_vars.custmodpth) + os.path.join("models", utils.koboldai_vars.custmodpth), ]: try: - with open(os.path.join(possible_config_path, "config.json"), "r") as file: + with open( + os.path.join(possible_config_path, "config.json"), "r" + ) as file: # Unused? 
self.model_config = json.load(file) model_path = possible_config_path break except FileNotFoundError: pass - + if not model_path: raise RuntimeError("Empty model_path!") with self._maybe_use_float16(): try: - self.model = GPT2LMHeadModel.from_pretrained(utils.koboldai_vars.custmodpth, revision=utils.koboldai_vars.revision, cache_dir="cache") - self.tokenizer = GPT2Tokenizer.from_pretrained(utils.koboldai_vars.custmodpth, revision=utils.koboldai_vars.revision, cache_dir="cache") + self.model = GPT2LMHeadModel.from_pretrained( + utils.koboldai_vars.custmodpth, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + ) + self.tokenizer = GPT2Tokenizer.from_pretrained( + utils.koboldai_vars.custmodpth, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + ) except Exception as e: if "out of memory" in traceback.format_exc().lower(): - raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") + raise RuntimeError( + "One of your GPUs ran out of memory when KoboldAI tried to load your model." + ) raise e if save_model: - self.model.save_pretrained(self.get_local_model_path(ignore_existance=True), max_shard_size="500MiB") - self.tokenizer.save_pretrained(self.get_local_model_path(ignore_existance=True)) + self.model.save_pretrained( + self.get_local_model_path(ignore_existance=True), + max_shard_size="500MiB", + ) + self.tokenizer.save_pretrained( + self.get_local_model_path(ignore_existance=True) + ) utils.koboldai_vars.modeldim = self.get_hidden_size() @@ -947,4 +2338,350 @@ class CustomGPT2HFTorchInferenceModel(HFTorchInferenceModel): else: self.model = self.model.to("cpu").float() - self.patch_causal_lm() \ No newline at end of file + self.patch_causal_lm() + + +class OpenAIAPIInferenceModel(InferenceModel): + def _load(self, save_model: bool) -> None: + self.tokenizer = self._get_tokenizer("gpt2") + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + ) -> GenerationResult: + # Taken mainly from oairequest() + + decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens)) + + # Store context in memory to use it for comparison with generated content + utils.koboldai_vars.lastctx = decoded_prompt + + # Build request JSON data + # GooseAI is a subntype of OAI. 
So to check if it's this type, we check the configname as a workaround + # as the koboldai_vars.model will always be OAI + if "GooseAI" in utils.koboldai_vars.configname: + reqdata = { + "prompt": decoded_prompt, + "max_tokens": max_new, + "temperature": gen_settings.temp, + "top_a": gen_settings.top_a, + "top_p": gen_settings.top_p, + "top_k": gen_settings.top_k, + "tfs": gen_settings.tfs, + "typical_p": gen_settings.typical, + "repetition_penalty": gen_settings.rep_pen, + "repetition_penalty_slope": gen_settings.rep_pen_slope, + "repetition_penalty_range": gen_settings.rep_pen_range, + "n": batch_count, + # TODO: Implement streaming + "stream": False, + } + else: + reqdata = { + "prompt": decoded_prompt, + "max_tokens": max_new, + "temperature": gen_settings.temp, + "top_p": gen_settings.top_p, + "n": batch_count, + "stream": False, + } + + req = requests.post( + utils.koboldai_vars.oaiurl, + json=reqdata, + headers={ + "Authorization": "Bearer " + utils.koboldai_vars.oaiapikey, + "Content-Type": "application/json", + }, + ) + + j = req.json() + + if not req.ok: + # Send error message to web client + if "error" in j: + error_type = j["error"]["type"] + error_message = j["error"]["message"] + else: + error_type = "Unknown" + error_message = "Unknown" + raise OpenAIAPIError(error_type, error_message) + + outputs = [out["text"] for out in j["choices"]] + return GenerationResult( + model=self, + out_batches=np.array([self.tokenizer.encode(x) for x in outputs]), + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) + + +class HordeInferenceModel(InferenceModel): + def _load(self, save_model: bool) -> None: + self.tokenizer = self._get_tokenizer( + utils.koboldai_vars.cluster_requested_models[0] + if len(utils.koboldai_vars.cluster_requested_models) > 0 + else "gpt2", + ) + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + ) -> GenerationResult: + decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens)) + + # Store context in memory to use it for comparison with generated content + utils.koboldai_vars.lastctx = decoded_prompt + + # Build request JSON data + reqdata = { + "max_length": max_new, + "max_context_length": utils.koboldai_vars.max_length, + "rep_pen": gen_settings.rep_pen, + "rep_pen_slope": gen_settings.rep_pen_slope, + "rep_pen_range": gen_settings.rep_pen_range, + "temperature": gen_settings.temp, + "top_p": gen_settings.top_p, + "top_k": int(gen_settings.top_k), + "top_a": gen_settings.top_a, + "tfs": gen_settings.tfs, + "typical": gen_settings.typical, + "n": batch_count, + } + + cluster_metadata = { + "prompt": decoded_prompt, + "params": reqdata, + "models": [x for x in utils.koboldai_vars.cluster_requested_models if x], + "trusted_workers": False, + } + + cluster_headers = {"apikey": utils.koboldai_vars.apikey} + + try: + # Create request + req = requests.post( + utils.koboldai_vars.colaburl[:-8] + "/api/v2/generate/async", + json=cluster_metadata, + headers=cluster_headers, + ) + except requests.exceptions.ConnectionError: + errmsg = f"Horde unavailable. Please try again later" + logger.error(errmsg) + raise HordeException(errmsg) + + if req.status_code == 503: + errmsg = f"KoboldAI API Error: No available KoboldAI servers found in Horde to fulfil this request using the selected models or other properties." 
+ logger.error(errmsg) + raise HordeException(errmsg) + elif not req.ok: + errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console." + logger.error(errmsg) + logger.error(f"HTTP {req.status_code}!!!") + logger.error(req.text) + raise HordeException(errmsg) + + try: + js = req.json() + except requests.exceptions.JSONDecodeError: + errmsg = f"Unexpected message received from the Horde: '{req.text}'" + logger.error(errmsg) + raise HordeException(errmsg) + + request_id = js["id"] + logger.debug("Horde Request ID: {}".format(request_id)) + + # We've sent the request and got the ID back, now we need to watch it to see when it finishes + finished = False + + while not finished: + try: + req = requests.get( + utils.koboldai_vars.colaburl[:-8] + + "/api/v1/generate/check/" + + request_id + ) + except requests.exceptions.ConnectionError: + errmsg = f"Horde unavailable. Please try again later" + logger.error(errmsg) + raise HordeException(errmsg) + + if not req.ok: + errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console." + logger.error(req.text) + raise HordeException(errmsg) + + try: + js = req.json() + except requests.exceptions.JSONDecodeError: + errmsg = ( + f"Unexpected message received from the KoboldAI Horde: '{req.text}'" + ) + logger.error(errmsg) + raise HordeException(errmsg) + + if "done" not in js: + errmsg = f"Unexpected response received from the KoboldAI Horde: '{js}'" + logger.error(errmsg) + raise HordeException(errmsg) + + finished = js["done"] + utils.koboldai_vars.horde_wait_time = js["wait_time"] + utils.koboldai_vars.horde_queue_position = js["queue_position"] + utils.koboldai_vars.horde_queue_size = js["waiting"] + + if not finished: + logger.debug(js) + time.sleep(1) + + logger.debug("Last Horde Status Message: {}".format(js)) + js = requests.get( + utils.koboldai_vars.colaburl[:-8] + "/api/v1/generate/prompt/" + request_id + ).json()["generations"] + logger.debug("Horde Result: {}".format(js)) + + gen_servers = [(cgen["server_name"], cgen["server_id"]) for cgen in js] + logger.info(f"Generations by: {gen_servers}") + + # TODO: Fix this, using tpool so it's a context error + # Just in case we want to announce it to the user + # if len(js) == 1: + # warnmsg = f"Text generated by {js[0]['server_name']}" + # emit('from_server', {'cmd': 'warnmsg', 'data': warnmsg}, broadcast=True) + + return GenerationResult( + model=self, + out_batches=np.array([self.tokenizer.encode(cgen["text"]) for cgen in js]), + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) + + +class ColabInferenceModel(InferenceModel): + def _load(self, save_model: bool) -> None: + self.tokenizer = self._get_tokenizer("EleutherAI/gpt-neo-2.7B") + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + ): + decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens)) + + # Store context in memory to use it for comparison with generated content + utils.koboldai_vars.lastctx = decoded_prompt + + # Build request JSON data + reqdata = { + "text": decoded_prompt, + "min": 0, + "max": max_new, + "rep_pen": gen_settings.rep_pen, + "rep_pen_slope": gen_settings.rep_pen_slope, + "rep_pen_range": gen_settings.rep_pen_range, + "temperature": gen_settings.temp, + "top_p": gen_settings.top_p, + "top_k": gen_settings.top_k, + "tfs": gen_settings.tfs, + "typical": 
gen_settings.typical,
+            "topa": gen_settings.top_a,
+            "numseqs": batch_count,
+            "retfultxt": False,
+        }
+
+        # Create request
+        req = requests.post(utils.koboldai_vars.colaburl, json=reqdata)
+
+        if req.status_code != 200:
+            raise ColabException(f"Bad status code {req.status_code}")
+
+        # Deal with the response
+        js = req.json()["data"]
+
+        # Try to be backwards compatible with outdated colab
+        if "text" in js:
+            genout = [utils.getnewcontent(js["text"], self.tokenizer)]
+        else:
+            genout = js["seqs"]
+
+        return GenerationResult(
+            model=self,
+            out_batches=np.array([self.tokenizer.encode(x) for x in genout]),
+            prompt=prompt_tokens,
+            is_whole_generation=True,
+            single_line=single_line,
+        )
+
+
+class APIInferenceModel(InferenceModel):
+    def _load(self, save_model: bool) -> None:
+        self.tokenizer = self._get_tokenizer("EleutherAI/gpt-neo-2.7B")
+
+    def _raw_generate(
+        self,
+        prompt_tokens: Union[List[int], torch.Tensor],
+        max_new: int,
+        gen_settings: GenerationSettings,
+        single_line: bool = False,
+        batch_count: int = 1,
+    ):
+        decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens))
+
+        # Store context in memory to use it for comparison with generated content
+        utils.koboldai_vars.lastctx = decoded_prompt
+
+        # Build request JSON data
+        reqdata = {
+            "prompt": decoded_prompt,
+            "max_length": max_new,
+            "max_context_length": utils.koboldai_vars.max_length,
+            "rep_pen": gen_settings.rep_pen,
+            "rep_pen_slope": gen_settings.rep_pen_slope,
+            "rep_pen_range": gen_settings.rep_pen_range,
+            "temperature": gen_settings.temp,
+            "top_p": gen_settings.top_p,
+            "top_k": gen_settings.top_k,
+            "top_a": gen_settings.top_a,
+            "tfs": gen_settings.tfs,
+            "typical": gen_settings.typical,
+            "n": batch_count,
+        }
+
+        # Create request
+        while True:
+            req = requests.post(
+                utils.koboldai_vars.colaburl[:-8] + "/api/v1/generate",
+                json=reqdata,
+            )
+            if (
+                req.status_code == 503
+            ):  # Server is currently generating something else so poll until it's our turn
+                time.sleep(1)
+                continue
+
+            js = req.json()
+            if req.status_code != 200:
+                logger.error(json.dumps(js, indent=4))
+                raise APIException(f"Bad API status code {req.status_code}")
+
+            genout = [obj["text"] for obj in js["results"]]
+            return GenerationResult(
+                model=self,
+                out_batches=np.array([self.tokenizer.encode(x) for x in genout]),
+                prompt=prompt_tokens,
+                is_whole_generation=True,
+                single_line=single_line,
+            )
diff --git a/utils.py b/utils.py
index 7808729c..ae32808b 100644
--- a/utils.py
+++ b/utils.py
@@ -8,6 +8,7 @@ from urllib.error import HTTPError
 import requests
 import requests.adapters
 import time
+import breakmodel
 from transformers import __version__ as transformers_version
 from transformers import PreTrainedModel
 import packaging.version
@@ -637,6 +638,7 @@ def get_missing_module_names(model: PreTrainedModel, names: List[str]) -> List[s
 
 class UIProgressBarFile(object):
     """Write TQDM progress to the UI."""
+
     def write(self, bar):
         bar = bar.replace("\r", "").replace("\n", "").replace(chr(0), "")
         if bar != "" and [ord(num) for num in bar] != [27, 91, 65]: #No idea why we're getting the 27, 1, 65 character set, just killing to so we can move on
@@ -649,4 +651,32 @@ class UIProgressBarFile(object):
         pass
 
     def flush(self):
-        pass
\ No newline at end of file
+        pass
+
+def get_auxilary_device():
+    """Get the device that auxiliary tensors (like inputs) should be stored on."""
+
+    # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU.
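+    # Prefer the dedicated GPU when single-GPU mode is active, then breakmodel's
+    # primary device when the model is split across devices, and fall back to CPU.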
+    if koboldai_vars.hascuda and koboldai_vars.usegpu:
+        return koboldai_vars.gpu_device
+    elif koboldai_vars.hascuda and koboldai_vars.breakmodel:
+        return breakmodel.primary_device
+    return "cpu"
+
+#==================================================================#
+# Strips submitted text from the text returned by the AI
+#==================================================================#
+def getnewcontent(txt, tokenizer):
+    # If the submitted context was blank, then everything is new
+    if(koboldai_vars.lastctx == ""):
+        return txt
+
+    # Tokenize the last context and the generated content
+    ctxtokens = tokenizer.encode(encodenewlines(koboldai_vars.lastctx), max_length=int(2e9), truncation=True)
+    txttokens = tokenizer.encode(encodenewlines(txt), max_length=int(2e9), truncation=True)
+    dif = (len(txttokens) - len(ctxtokens)) * -1
+
+    # Remove the context from the returned text
+    newtokens = txttokens[dif:]
+
+    return decodenewlines(tokenizer.decode(newtokens))
\ No newline at end of file
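
For reference, the token-diff idea behind getnewcontent() can be sketched in isolation. This is a minimal illustration only (not part of the patch), assuming a stock GPT-2 tokenizer and a hypothetical strip_context() helper; the encodenewlines/decodenewlines wrappers and the int(2e9) truncation guards are omitted:

    from transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    def strip_context(full_text: str, context: str) -> str:
        # If no context was submitted, everything in full_text is new.
        if context == "":
            return full_text
        ctx_tokens = tokenizer.encode(context)
        txt_tokens = tokenizer.encode(full_text)
        # Keep only the tokens past the original context length; whenever new
        # tokens were generated this is the same slice as txttokens[dif:] above.
        new_tokens = txt_tokens[len(ctx_tokens):]
        return tokenizer.decode(new_tokens)

    print(strip_context("Once upon a time there was a cat.", "Once upon a time"))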