From 807e3e88c2fb3fa26969144979a3208893cb44b4 Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 14 Sep 2022 18:28:37 -0500 Subject: [PATCH 01/31] Gen split progress --- aiserver.py | 46 +++++++++++++++++++++++++++++++++++++++++++- koboldai_settings.py | 15 +++++++++++++-- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index e8322f4a..469e79b1 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1841,6 +1841,10 @@ def patch_transformers(): scores: torch.FloatTensor, **kwargs, ) -> bool: + + if koboldai_vars.inference_config.do_dynamic_wi: + pass + koboldai_vars.generated_tkns += 1 if(not koboldai_vars.standalone and koboldai_vars.lua_koboldbridge.generated_cols and koboldai_vars.generated_tkns != koboldai_vars.lua_koboldbridge.generated_cols): raise RuntimeError(f"Inconsistency detected between KoboldAI Python and Lua backends ({koboldai_vars.generated_tkns} != {koboldai_vars.lua_koboldbridge.generated_cols})") @@ -1874,8 +1878,9 @@ def patch_transformers(): return self.regeneration_required or self.halt old_get_stopping_criteria = transformers.generation_utils.GenerationMixin._get_stopping_criteria def new_get_stopping_criteria(self, *args, **kwargs): - stopping_criteria = old_get_stopping_criteria(self, *args, **kwargs) global tokenizer + stopping_criteria = old_get_stopping_criteria(self, *args, **kwargs) + self.kai_scanner = DynamicWorldInfoScanCriteria( tokenizer=tokenizer, excluded_world_info=self.kai_scanner_excluded_world_info, @@ -2606,6 +2611,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal sendsettings() refresh_settings() + prompto = "What does 1+1 equal?\n" + print("Hehe") + out = raw_generate(prompto, 80) + print(f"{out=}") + #Saving the tokenizer to the KoboldStoryRegister class so we can do token counting on the story data if 'tokenizer' in [x for x in globals()]: koboldai_vars.tokenizer = tokenizer @@ -4619,6 +4629,40 @@ def calcsubmit(txt): # Send it! 
ikrequest(subtxt) +def raw_generate( + prompt: str, + max_length: int, + + do_streaming: bool = False, + do_dynamic_wi: bool = False, +): + + koboldai_vars.inference_config.do_streaming = do_streaming + koboldai_vars.inference_config.do_dynamic_wi = do_dynamic_wi + + prompt_tokens = tokenizer.encode(prompt) + gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] + + device = "cpu" + if koboldai_vars.hascuda and koboldai_vars.usegpu: + device = koboldai_vars.gpu_device + elif koboldai_vars.hascuda and koboldai_vars.breakmodel: + device = breakmodel.primary_device + gen_in = gen_in.to(device) + + with torch.no_grad(): + genout = generator( + gen_in, + do_sample=True, + max_length=max_length, + repetition_penalty=1.0, + bad_words_ids=koboldai_vars.badwordsids, + use_cache=True, + ) + + text_out = tokenizer.decode(genout[0]) + return text_out + #==================================================================# # Send text to generator and deal with output #==================================================================# diff --git a/koboldai_settings.py b/koboldai_settings.py index b29db50f..931039c0 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass import os, re, time, threading, json, pickle, base64, copy, tqdm, datetime, sys from io import BytesIO from flask import has_request_context @@ -705,8 +706,8 @@ class user_settings(settings): process_variable_changes(self.socketio, self.__class__.__name__.replace("_settings", ""), name, value, old_value) class system_settings(settings): - local_only_variables = ['socketio', 'lua_state', 'lua_logname', 'lua_koboldbridge', 'lua_kobold', 'lua_koboldcore', 'regex_sl', 'acregex_ai', 'acregex_ui', 'comregex_ai', 'comregex_ui', 'sp', '_horde_pid'] - no_save_variables = ['socketio', 'lua_state', 'lua_logname', 'lua_koboldbridge', 'lua_kobold', 'lua_koboldcore', 'sp', '_horde_pid', 'horde_share', 'aibusy', 'serverstarted'] + local_only_variables = ['socketio', 'lua_state', 'lua_logname', 'lua_koboldbridge', 'lua_kobold', 'lua_koboldcore', 'regex_sl', 'acregex_ai', 'acregex_ui', 'comregex_ai', 'comregex_ui', 'sp', '_horde_pid', 'inference_config'] + no_save_variables = ['socketio', 'lua_state', 'lua_logname', 'lua_koboldbridge', 'lua_kobold', 'lua_koboldcore', 'sp', '_horde_pid', 'horde_share', 'aibusy', 'serverstarted', 'inference_config'] settings_name = "system" def __init__(self, socketio): self.socketio = socketio @@ -784,6 +785,16 @@ class system_settings(settings): self.horde_share = False self._horde_pid = None self.cookies = {} #cookies for colab since colab's URL changes, cookies are lost + + @dataclass + class _inference_config: + do_streaming: bool = False + + # NOTE: DynamicWorldInfoScanCriteria handles not only dynamic world + # info, but also max length, aborting, regeneration requests, etc + # for kobold-rooted stuff. This would be nice to change in the future. 
+ do_dynamic_wi: bool = False + self.inference_config = _inference_config() def __setattr__(self, name, value): From 257d2b124715f5d614db3e3f0c1801c43f7102ae Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 15 Sep 2022 16:57:55 -0500 Subject: [PATCH 02/31] Gen work --- aiserver.py | 115 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 23 deletions(-) diff --git a/aiserver.py b/aiserver.py index 469e79b1..6e848a2b 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1807,21 +1807,14 @@ def patch_transformers(): scores: torch.FloatTensor, **kwargs, ) -> bool: - if (not koboldai_vars.output_streaming): + if not koboldai_vars.inference_config.do_dynamic_wi: return False - #for batch, ids in enumerate(input_ids): - #tokenizer_text = utils.decodenewlines(tokenizer.decode(ids[-1])) - #koboldai_vars.actions.stream_token(tokenizer_text, batch=batch) - - if koboldai_vars.output_streaming: - koboldai_vars.actions.stream_tokens([utils.decodenewlines(tokenizer.decode(x[-1])) for x in input_ids]) - #if len(input_ids) > 1: - # koboldai_vars.actions.clear_unused_options() - # koboldai_vars.actions.append_options([utils.decodenewlines(tokenizer.decode(x[-1])) for x in input_ids]) - #else: - # koboldai_vars.actions[koboldai_vars.actions.action_count+1] = utils.decodenewlines(tokenizer.decode(input_ids[0, -1])) - + if not koboldai_vars.output_streaming: + return False + + koboldai_vars.actions.stream_tokens([utils.decodenewlines(tokenizer.decode(x[-1])) for x in input_ids]) + return False # Sets up dynamic world info scanner @@ -1842,8 +1835,8 @@ def patch_transformers(): **kwargs, ) -> bool: - if koboldai_vars.inference_config.do_dynamic_wi: - pass + if not koboldai_vars.inference_config.do_dynamic_wi: + return False koboldai_vars.generated_tkns += 1 if(not koboldai_vars.standalone and koboldai_vars.lua_koboldbridge.generated_cols and koboldai_vars.generated_tkns != koboldai_vars.lua_koboldbridge.generated_cols): @@ -2611,11 +2604,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal sendsettings() refresh_settings() - prompto = "What does 1+1 equal?\n" - print("Hehe") - out = raw_generate(prompto, 80) - print(f"{out=}") - #Saving the tokenizer to the KoboldStoryRegister class so we can do token counting on the story data if 'tokenizer' in [x for x in globals()]: koboldai_vars.tokenizer = tokenizer @@ -4633,6 +4621,68 @@ def raw_generate( prompt: str, max_length: int, + do_streaming: bool = False, + do_dynamic_wi: bool = False, +): + prompt_tokens = tokenizer.encode(prompt) + + if koboldai_vars.model == "Colab": + raise NotImplementedError("Colab API raw_generate unsupported") + elif koboldai_vars.model == "API": + raise NotImplementedError("API raw_generate unsupported") + elif koboldai_vars.model == "CLUSTER": + raise NotImplementedError("Cluster raw_generate unsupported") + elif koboldai_vars.model == "OAI": + raise NotImplementedError("OpenAI raw_generate unsupported") + elif koboldai_vars.model == "ReadOnly": + raise NotImplementedError("No loaded model") + + if koboldai_vars.use_colab_tpu or model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): + out_tokens = tpu_raw_generate( + prompt_tokens=prompt_tokens, + max_length=max_length, + ) + else: + out_tokens = torch_raw_generate( + prompt_tokens=prompt_tokens, + max_length=max_length, + do_streaming=do_streaming, + do_dynamic_wi=do_dynamic_wi, + ) + + return utils.decodenewlines(tokenizer.decode(out_tokens)) + +def tpu_raw_generate( + prompt_tokens: list[int], + max_length: 
int +): + # Mostly lifted from apiactionsubmit_tpumtjgenerate + soft_tokens = tpumtjgetsofttokens() + genout = tpool.execute( + tpu_mtj_backend.infer_static, + np.uint32(prompt_tokens), + gen_len = max_length, + temp=koboldai_vars.temp, + top_p=koboldai_vars.top_p, + top_k=koboldai_vars.top_k, + tfs=koboldai_vars.tfs, + typical=koboldai_vars.typical, + top_a=koboldai_vars.top_a, + numseqs=1, + repetition_penalty=koboldai_vars.rep_pen, + rpslope=koboldai_vars.rep_pen_slope, + rprange=koboldai_vars.rep_pen_range, + soft_embeddings=koboldai_vars.sp, + soft_tokens=soft_tokens, + sampler_order=koboldai_vars.sampler_order, + ) + + return genout[0] + +def torch_raw_generate( + prompt_tokens: list[int], + max_length: int, + do_streaming: bool = False, do_dynamic_wi: bool = False, ): @@ -4640,7 +4690,9 @@ def raw_generate( koboldai_vars.inference_config.do_streaming = do_streaming koboldai_vars.inference_config.do_dynamic_wi = do_dynamic_wi - prompt_tokens = tokenizer.encode(prompt) + # Makes stopping criteria hook happy + model.kai_scanner_excluded_world_info = [] + gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] device = "cpu" @@ -4660,8 +4712,7 @@ def raw_generate( use_cache=True, ) - text_out = tokenizer.decode(genout[0]) - return text_out + return genout[0] #==================================================================# # Send text to generator and deal with output @@ -7871,6 +7922,24 @@ def UI_2_save_cookies(data): with open("./settings/cookies.settings", "w") as f: json.dump(koboldai_vars.cookies, f) +@app.route("/generate_raw", methods=["GET"]) +def UI_2_generate_raw(): + prompt = request.args.get("prompt") + + if not prompt: + return Response(json.dumps({"error": "No prompt"}), status=400) + + if not model: + return Response(json.dumps({"error": "No model"}), status=500) + + try: + out = raw_generate(prompt, max_length=80) + except NotImplementedError as e: + return Response(json.dumps({"error": str(e)}), status=500) + + print(f"{out=}") + return out + #==================================================================# # Load Tweaks #==================================================================# From f075ca909531e61dc2c3ffc0bd81b232a0b77991 Mon Sep 17 00:00:00 2001 From: somebody Date: Fri, 16 Sep 2022 17:12:47 -0500 Subject: [PATCH 03/31] Gen work --- aiserver.py | 65 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 9 deletions(-) diff --git a/aiserver.py b/aiserver.py index 6e848a2b..67635a57 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4617,14 +4617,56 @@ def calcsubmit(txt): # Send it! 
ikrequest(subtxt) +def legacy_generate(text): + # Architected after oairequest + + koboldai_vars.lastctx = text + + outputs = raw_generate(text) + + # Lua bridge, genmod + for i, output in enumerate(outputs): + koboldai_vars.lua_koboldbridge.outputs[i + 1] = output + + execute_genmod() + + if koboldai_vars.lua_koboldbridge.regeneration_required: + koboldai_vars.lua_koboldbridge.regeneration_required = False + genout = [] + for i in range(len(outputs)): + out = koboldai_vars.lua_koboldbridge.outputs[i + 1] + genout.append({"generated_text": out}) + assert isinstance(out, str) + else: + genout = [{"generated_text": utils.decodenewlines(x)} for x in outputs] + + koboldai_vars.actions.append_options([applyoutputformatting(x["generated_text"]) for x in genout]) + genout = [{"generated_text": x['text']} for x in koboldai_vars.actions.get_current_options()] + + if len(genout) == 1: + genresult(genout[0]["generated_text"]) + else: + restart_seq = koboldai_vars.lua_koboldbridge.restart_sequence + if restart_seq and restart_seq > 0: + genresult(genout[restart_seq - 1]["generated_text"]) + else: + genselect(genout) + set_aibusy(0) + def raw_generate( - prompt: str, + # prompt is either a string (text) or a list (token ids) + prompt: Union[str, list], max_length: int, do_streaming: bool = False, do_dynamic_wi: bool = False, -): - prompt_tokens = tokenizer.encode(prompt) + batch_count: int = 1, +) -> List: + + if isinstance(prompt, str): + prompt_tokens = tokenizer.encode(prompt) + else: + prompt_tokens = prompt if koboldai_vars.model == "Colab": raise NotImplementedError("Colab API raw_generate unsupported") @@ -4638,23 +4680,26 @@ def raw_generate( raise NotImplementedError("No loaded model") if koboldai_vars.use_colab_tpu or model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): - out_tokens = tpu_raw_generate( + batch_out = tpu_raw_generate( prompt_tokens=prompt_tokens, max_length=max_length, + batch_count=batch_count ) else: - out_tokens = torch_raw_generate( + batch_out = torch_raw_generate( prompt_tokens=prompt_tokens, max_length=max_length, do_streaming=do_streaming, do_dynamic_wi=do_dynamic_wi, + batch_count=batch_count ) - return utils.decodenewlines(tokenizer.decode(out_tokens)) + return [utils.decodenewlines(tokenizer.decode(x)) for x in batch_out] def tpu_raw_generate( prompt_tokens: list[int], - max_length: int + max_length: int, + batch_count: int, ): # Mostly lifted from apiactionsubmit_tpumtjgenerate soft_tokens = tpumtjgetsofttokens() @@ -4668,7 +4713,7 @@ def tpu_raw_generate( tfs=koboldai_vars.tfs, typical=koboldai_vars.typical, top_a=koboldai_vars.top_a, - numseqs=1, + numseqs=batch_count, repetition_penalty=koboldai_vars.rep_pen, rpslope=koboldai_vars.rep_pen_slope, rprange=koboldai_vars.rep_pen_range, @@ -4677,7 +4722,7 @@ def tpu_raw_generate( sampler_order=koboldai_vars.sampler_order, ) - return genout[0] + return genout def torch_raw_generate( prompt_tokens: list[int], @@ -4685,6 +4730,7 @@ def torch_raw_generate( do_streaming: bool = False, do_dynamic_wi: bool = False, + batch_count: int = 1 ): koboldai_vars.inference_config.do_streaming = do_streaming @@ -4710,6 +4756,7 @@ def torch_raw_generate( repetition_penalty=1.0, bad_words_ids=koboldai_vars.badwordsids, use_cache=True, + num_return_sequences=batch_count, ) return genout[0] From 8ffc084ef307489522666a571fd97b04bb7d1dd9 Mon Sep 17 00:00:00 2001 From: somebody Date: Fri, 16 Sep 2022 20:16:11 -0500 Subject: [PATCH 04/31] Eternal gen work --- aiserver.py | 79 +++++++++++++++++++++++++++++++---------------------- 
1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/aiserver.py b/aiserver.py index 67635a57..b219dfd2 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4534,32 +4534,27 @@ def calcsubmit(txt): print("Using Alt Gen") else: subtxt, min, max = calcsubmitbudget(actionlen, winfo, mem, anotetxt, koboldai_vars.actions, submission=txt) - if(actionlen == 0): - if(not koboldai_vars.use_colab_tpu and koboldai_vars.model not in ["Colab", "API", "CLUSTER", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): - generate(subtxt, min, max, found_entries=found_entries) - elif(koboldai_vars.model == "Colab"): - sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - elif(koboldai_vars.model == "API"): - sendtoapi(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - elif(koboldai_vars.model == "CLUSTER"): - sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - elif(koboldai_vars.model == "OAI"): - oairequest(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - elif(koboldai_vars.use_colab_tpu or koboldai_vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): - tpumtjgenerate(subtxt, min, max, found_entries=found_entries) - else: - if(not koboldai_vars.use_colab_tpu and koboldai_vars.model not in ["Colab", "API", "CLUSTER", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): - generate(subtxt, min, max, found_entries=found_entries) - elif(koboldai_vars.model == "Colab"): - sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - elif(koboldai_vars.model == "API"): - sendtoapi(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - elif(koboldai_vars.model == "CLUSTER"): - sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - elif(koboldai_vars.model == "OAI"): - oairequest(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - elif(koboldai_vars.use_colab_tpu or koboldai_vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): - tpumtjgenerate(subtxt, min, max, found_entries=found_entries) + + if not koboldai_vars.use_colab_tpu and koboldai_vars.model not in ( + "Colab", + "API", + "CLUSTER", + "OAI", + "TPUMeshTransformerGPTJ", + "TPUMeshTransformerGPTNeoX" + ): + legacy_generate(subtxt, min, max) + # generate(subtxt, min, max, found_entries=found_entries) + elif koboldai_vars.model == "Colab": + sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) + elif koboldai_vars.model == "API": + sendtoapi(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) + elif koboldai_vars.model == "CLUSTER": + sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) + elif koboldai_vars.model == "OAI": + oairequest(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) + elif koboldai_vars.use_colab_tpu or koboldai_vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): + tpumtjgenerate(subtxt, min, max, found_entries=found_entries) # For InferKit web API else: @@ -4617,18 +4612,28 @@ def calcsubmit(txt): # Send it! 
ikrequest(subtxt) -def legacy_generate(text): +def legacy_generate(text: Union[str, list], min: int, max: int): # Architected after oairequest koboldai_vars.lastctx = text - outputs = raw_generate(text) + print("Pregen") + print(koboldai_vars.max_length) + outputs = raw_generate( + text, + max_length=koboldai_vars.genamt, + do_streaming=True + ) + print(f"postgen: {outputs}") # Lua bridge, genmod for i, output in enumerate(outputs): koboldai_vars.lua_koboldbridge.outputs[i + 1] = output + print("post lua") + execute_genmod() + print("post genmod") if koboldai_vars.lua_koboldbridge.regeneration_required: koboldai_vars.lua_koboldbridge.regeneration_required = False @@ -4639,10 +4644,14 @@ def legacy_generate(text): assert isinstance(out, str) else: genout = [{"generated_text": utils.decodenewlines(x)} for x in outputs] + + print("post assign genout") koboldai_vars.actions.append_options([applyoutputformatting(x["generated_text"]) for x in genout]) genout = [{"generated_text": x['text']} for x in koboldai_vars.actions.get_current_options()] + print("post genout assign") + if len(genout) == 1: genresult(genout[0]["generated_text"]) else: @@ -4651,7 +4660,9 @@ def legacy_generate(text): genresult(genout[restart_seq - 1]["generated_text"]) else: genselect(genout) + print("post whatever that is") set_aibusy(0) + print("post busy") def raw_generate( # prompt is either a string (text) or a list (token ids) @@ -4697,7 +4708,7 @@ def raw_generate( return [utils.decodenewlines(tokenizer.decode(x)) for x in batch_out] def tpu_raw_generate( - prompt_tokens: list[int], + prompt_tokens: List[int], max_length: int, batch_count: int, ): @@ -4725,7 +4736,7 @@ def tpu_raw_generate( return genout def torch_raw_generate( - prompt_tokens: list[int], + prompt_tokens: List[int], max_length: int, do_streaming: bool = False, @@ -4748,7 +4759,10 @@ def torch_raw_generate( device = breakmodel.primary_device gen_in = gen_in.to(device) + print("okay...") + with torch.no_grad(): + print(f"in {max_length}") genout = generator( gen_in, do_sample=True, @@ -4758,8 +4772,10 @@ def torch_raw_generate( use_cache=True, num_return_sequences=batch_count, ) + print("out") + print("wtf") - return genout[0] + return genout #==================================================================# # Send text to generator and deal with output @@ -7984,7 +8000,6 @@ def UI_2_generate_raw(): except NotImplementedError as e: return Response(json.dumps({"error": str(e)}), status=500) - print(f"{out=}") return out #==================================================================# From 636207bfacfe7a82ec0449080ebbc63e62fa82e5 Mon Sep 17 00:00:00 2001 From: somebody Date: Sat, 17 Sep 2022 20:33:38 -0500 Subject: [PATCH 05/31] Gen gen gen --- aiserver.py | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/aiserver.py b/aiserver.py index b219dfd2..aa93a301 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1807,11 +1807,13 @@ def patch_transformers(): scores: torch.FloatTensor, **kwargs, ) -> bool: - if not koboldai_vars.inference_config.do_dynamic_wi: + if not koboldai_vars.inference_config.do_streaming: return False if not koboldai_vars.output_streaming: return False + + print([utils.decodenewlines(tokenizer.decode(x[-1])) for x in input_ids]) koboldai_vars.actions.stream_tokens([utils.decodenewlines(tokenizer.decode(x[-1])) for x in input_ids]) @@ -4617,23 +4619,17 @@ def legacy_generate(text: Union[str, list], min: int, max: int): koboldai_vars.lastctx = text - print("Pregen") - 
print(koboldai_vars.max_length) outputs = raw_generate( text, max_length=koboldai_vars.genamt, do_streaming=True ) - print(f"postgen: {outputs}") # Lua bridge, genmod for i, output in enumerate(outputs): koboldai_vars.lua_koboldbridge.outputs[i + 1] = output - print("post lua") - execute_genmod() - print("post genmod") if koboldai_vars.lua_koboldbridge.regeneration_required: koboldai_vars.lua_koboldbridge.regeneration_required = False @@ -4644,14 +4640,10 @@ def legacy_generate(text: Union[str, list], min: int, max: int): assert isinstance(out, str) else: genout = [{"generated_text": utils.decodenewlines(x)} for x in outputs] - - print("post assign genout") koboldai_vars.actions.append_options([applyoutputformatting(x["generated_text"]) for x in genout]) genout = [{"generated_text": x['text']} for x in koboldai_vars.actions.get_current_options()] - print("post genout assign") - if len(genout) == 1: genresult(genout[0]["generated_text"]) else: @@ -4660,9 +4652,7 @@ def legacy_generate(text: Union[str, list], min: int, max: int): genresult(genout[restart_seq - 1]["generated_text"]) else: genselect(genout) - print("post whatever that is") set_aibusy(0) - print("post busy") def raw_generate( # prompt is either a string (text) or a list (token ids) @@ -4699,13 +4689,15 @@ def raw_generate( else: batch_out = torch_raw_generate( prompt_tokens=prompt_tokens, - max_length=max_length, + max_new=max_length, do_streaming=do_streaming, do_dynamic_wi=do_dynamic_wi, batch_count=batch_count ) + + decoded = tokenizer.batch_decode(batch_out[:, len(prompt_tokens):]) - return [utils.decodenewlines(tokenizer.decode(x)) for x in batch_out] + return [utils.decodenewlines(x) for x in decoded] def tpu_raw_generate( prompt_tokens: List[int], @@ -4737,7 +4729,7 @@ def tpu_raw_generate( def torch_raw_generate( prompt_tokens: List[int], - max_length: int, + max_new: int, do_streaming: bool = False, do_dynamic_wi: bool = False, @@ -4759,21 +4751,16 @@ def torch_raw_generate( device = breakmodel.primary_device gen_in = gen_in.to(device) - print("okay...") - with torch.no_grad(): - print(f"in {max_length}") genout = generator( gen_in, do_sample=True, - max_length=max_length, + max_length=min(len(prompt_tokens) + max_new, koboldai_vars.max_length), repetition_penalty=1.0, bad_words_ids=koboldai_vars.badwordsids, use_cache=True, num_return_sequences=batch_count, ) - print("out") - print("wtf") return genout From 386477e59c98b0ef4bd5ef0c20ba87692ccea5fc Mon Sep 17 00:00:00 2001 From: somebody Date: Sat, 17 Sep 2022 20:47:44 -0500 Subject: [PATCH 06/31] Fix token streaming --- aiserver.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/aiserver.py b/aiserver.py index e1a7ac12..08a07183 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2009,11 +2009,8 @@ def patch_transformers(): if not koboldai_vars.output_streaming: return False - - print([utils.decodenewlines(tokenizer.decode(x[-1])) for x in input_ids]) koboldai_vars.actions.stream_tokens([utils.decodenewlines(tokenizer.decode(x[-1])) for x in input_ids]) - return False # Sets up dynamic world info scanner @@ -4744,7 +4741,6 @@ def calcsubmit(txt): "TPUMeshTransformerGPTNeoX" ): legacy_generate(subtxt, min, max) - # generate(subtxt, min, max, found_entries=found_entries) elif koboldai_vars.model == "Colab": sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) elif koboldai_vars.model == "API": @@ -4813,11 +4809,10 @@ def calcsubmit(txt): ikrequest(subtxt) def legacy_generate(text: Union[str, list], min: int, max: int): - # 
Architected after oairequest - koboldai_vars.lastctx = text - outputs = raw_generate( + outputs = tpool.execute( + raw_generate, text, max_length=koboldai_vars.genamt, do_streaming=True From 2ed7a9dcee05b92ae58c16228b3e731d6a4f5d85 Mon Sep 17 00:00:00 2001 From: somebody Date: Sun, 18 Sep 2022 22:30:43 -0500 Subject: [PATCH 07/31] Eternal gen work --- aiserver.py | 157 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 123 insertions(+), 34 deletions(-) diff --git a/aiserver.py b/aiserver.py index 08a07183..85767c09 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2013,6 +2013,49 @@ def patch_transformers(): koboldai_vars.actions.stream_tokens([utils.decodenewlines(tokenizer.decode(x[-1])) for x in input_ids]) return False + class CoreStopper(StoppingCriteria): + # Controls core generation stuff; aborting, counting generated tokens, etc + def __init__(self): + self.regeneration_required = False + self.halt = False + + def __call__( + self, + input_ids: torch.LongTensor, + scores: torch.FloatTensor, + **kwargs, + ) -> bool: + koboldai_vars.generated_tkns += 1 + + if ( + not koboldai_vars.standalone + and koboldai_vars.lua_koboldbridge.generated_cols + and koboldai_vars.generated_tkns != koboldai_vars.lua_koboldbridge.generated_cols + ): + print("[TODO] Fix generated_cols") + # raise RuntimeError(f"Inconsistency detected between KoboldAI Python and Lua backends ({koboldai_vars.generated_tkns} != {koboldai_vars.lua_koboldbridge.generated_cols})") + + if koboldai_vars.abort: + koboldai_vars.abort = False + self.regeneration_required = False + self.halt = False + return True + + if koboldai_vars.standalone: + return False + + assert input_ids.ndim == 2 + + self.regeneration_required = koboldai_vars.lua_koboldbridge.regeneration_required + self.halt = not koboldai_vars.lua_koboldbridge.generating + koboldai_vars.lua_koboldbridge.regeneration_required = False + + for i in range(koboldai_vars.numseqs): + koboldai_vars.lua_koboldbridge.generated[i+1][koboldai_vars.generated_tkns] = int(input_ids[i, -1].item()) + + return self.regeneration_required or self.halt + + # Sets up dynamic world info scanner class DynamicWorldInfoScanCriteria(StoppingCriteria): def __init__( @@ -2024,6 +2067,7 @@ def patch_transformers(): self.halt = False self.tokenizer = tokenizer self.excluded_world_info = excluded_world_info + def __call__( self, input_ids: torch.LongTensor, @@ -2034,35 +2078,38 @@ def patch_transformers(): if not koboldai_vars.inference_config.do_dynamic_wi: return False - koboldai_vars.generated_tkns += 1 - if(not koboldai_vars.standalone and koboldai_vars.lua_koboldbridge.generated_cols and koboldai_vars.generated_tkns != koboldai_vars.lua_koboldbridge.generated_cols): - raise RuntimeError(f"Inconsistency detected between KoboldAI Python and Lua backends ({koboldai_vars.generated_tkns} != {koboldai_vars.lua_koboldbridge.generated_cols})") - if(koboldai_vars.abort or koboldai_vars.generated_tkns >= koboldai_vars.genamt): - self.regeneration_required = False - self.halt = False - koboldai_vars.abort = False - return True - if(koboldai_vars.standalone): + # if(koboldai_vars.abort or koboldai_vars.generated_tkns >= koboldai_vars.genamt): + # self.regeneration_required = False + # self.halt = False + # koboldai_vars.abort = False + # return True + + # Pertains to WI I think + # if(koboldai_vars.standalone): + # return False + + # assert input_ids.ndim == 2 + assert len(self.excluded_world_info) == input_ids.shape[0] + # self.regeneration_required = 
koboldai_vars.lua_koboldbridge.regeneration_required + # self.halt = not koboldai_vars.lua_koboldbridge.generating + # koboldai_vars.lua_koboldbridge.regeneration_required = False + + # for i in range(koboldai_vars.numseqs): + # koboldai_vars.lua_koboldbridge.generated[i+1][koboldai_vars.generated_tkns] = int(input_ids[i, -1].item()) + + if not koboldai_vars.dynamicscan: + #return self.regeneration_required or self.halt return False - assert input_ids.ndim == 2 - assert len(self.excluded_world_info) == input_ids.shape[0] - self.regeneration_required = koboldai_vars.lua_koboldbridge.regeneration_required - self.halt = not koboldai_vars.lua_koboldbridge.generating - koboldai_vars.lua_koboldbridge.regeneration_required = False - - for i in range(koboldai_vars.numseqs): - koboldai_vars.lua_koboldbridge.generated[i+1][koboldai_vars.generated_tkns] = int(input_ids[i, -1].item()) - - if(not koboldai_vars.dynamicscan): - return self.regeneration_required or self.halt tail = input_ids[..., -koboldai_vars.generated_tkns:] for i, t in enumerate(tail): decoded = utils.decodenewlines(tokenizer.decode(t)) _, found = checkworldinfo(decoded, force_use_txt=True, actions=koboldai_vars.actions) found -= self.excluded_world_info[i] - if(len(found) != 0): - self.regeneration_required = True + if len(found) != 0: + # self.regeneration_required = True + model.core_stopper.regeneration_required = True + return True break return self.regeneration_required or self.halt old_get_stopping_criteria = transformers.generation_utils.GenerationMixin._get_stopping_criteria @@ -2070,12 +2117,14 @@ def patch_transformers(): global tokenizer stopping_criteria = old_get_stopping_criteria(self, *args, **kwargs) + self.core_stopper = CoreStopper() self.kai_scanner = DynamicWorldInfoScanCriteria( tokenizer=tokenizer, excluded_world_info=self.kai_scanner_excluded_world_info, ) token_streamer = TokenStreamer(tokenizer=tokenizer) + stopping_criteria.insert(0, self.core_stopper) stopping_criteria.insert(0, self.kai_scanner) token_streamer = TokenStreamer(tokenizer=tokenizer) stopping_criteria.insert(0, token_streamer) @@ -4811,28 +4860,36 @@ def calcsubmit(txt): def legacy_generate(text: Union[str, list], min: int, max: int): koboldai_vars.lastctx = text - outputs = tpool.execute( + out_batches = tpool.execute( raw_generate, text, max_length=koboldai_vars.genamt, - do_streaming=True + do_streaming=True, + batch_count=koboldai_vars.numseqs, + decode=False ) + decoded_batches = tokenizer.batch_decode(out_batches) + # Lua bridge, genmod - for i, output in enumerate(outputs): - koboldai_vars.lua_koboldbridge.outputs[i + 1] = output + for i in range(koboldai_vars.numseqs): + koboldai_vars.lua_koboldbridge.generated[i + 1][koboldai_vars.generated_tkns] = int(out_batches[i, -1].item()) + koboldai_vars.lua_koboldbridge.outputs[i + 1] = utils.decodenewlines(tokenizer.decode(out_batches[i, -len(out_batches[i]):])) + + # for i, output in enumerate(outputs): + # koboldai_vars.lua_koboldbridge.outputs[i + 1] = output execute_genmod() if koboldai_vars.lua_koboldbridge.regeneration_required: koboldai_vars.lua_koboldbridge.regeneration_required = False genout = [] - for i in range(len(outputs)): + for i in range(len(out_batches)): out = koboldai_vars.lua_koboldbridge.outputs[i + 1] genout.append({"generated_text": out}) assert isinstance(out, str) else: - genout = [{"generated_text": utils.decodenewlines(x)} for x in outputs] + genout = [{"generated_text": utils.decodenewlines(x)} for x in decoded_batches] 
koboldai_vars.actions.append_options([applyoutputformatting(x["generated_text"]) for x in genout]) genout = [{"generated_text": x['text']} for x in koboldai_vars.actions.get_current_options()] @@ -4855,12 +4912,17 @@ def raw_generate( do_streaming: bool = False, do_dynamic_wi: bool = False, batch_count: int = 1, + decode: bool = True, ) -> List: if isinstance(prompt, str): prompt_tokens = tokenizer.encode(prompt) else: prompt_tokens = prompt + + # Some gen methods such as OAI don't return tokens. + batch_decoded = None + batch_encoded = None if koboldai_vars.model == "Colab": raise NotImplementedError("Colab API raw_generate unsupported") @@ -4874,13 +4936,15 @@ def raw_generate( raise NotImplementedError("No loaded model") if koboldai_vars.use_colab_tpu or model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): - batch_out = tpu_raw_generate( + batch_encoded = tpu_raw_generate( prompt_tokens=prompt_tokens, max_length=max_length, batch_count=batch_count ) + elif model == "OAI": + batch_decoded = ... else: - batch_out = torch_raw_generate( + batch_encoded = torch_raw_generate( prompt_tokens=prompt_tokens, max_new=max_length, do_streaming=do_streaming, @@ -4888,9 +4952,21 @@ def raw_generate( batch_count=batch_count ) - decoded = tokenizer.batch_decode(batch_out[:, len(prompt_tokens):]) - return [utils.decodenewlines(x) for x in decoded] + assert batch_encoded or batch_decoded + + # Shave prompt off of encoded response. Decoded does not return prompt. + # TODO: Does MTJ generation shave this off automatically? Test it! + if batch_encoded: + batch_encoded = batch_encoded[:, len(prompt_tokens):] + + if not decode: + return batch_encoded + + if not batch_decoded: + batch_decoded = tokenizer.batch_decode(batch_encoded) + + return [utils.decodenewlines(x) for x in batch_decoded] def tpu_raw_generate( prompt_tokens: List[int], @@ -4961,7 +5037,7 @@ def torch_raw_generate( # Send text to generator and deal with output #==================================================================# -def _generate(txt, minimum, maximum, found_entries): +def old_underscore_generate(txt, minimum, maximum, found_entries): if(koboldai_vars.full_determinism): torch.manual_seed(koboldai_vars.seed) @@ -5000,19 +5076,30 @@ def _generate(txt, minimum, maximum, found_entries): ) already_generated += len(genout[0]) - len(gen_in[0]) assert already_generated <= koboldai_vars.genamt + # If we are halting, we stop if(model.kai_scanner.halt or not model.kai_scanner.regeneration_required): break + # if we require a generation, we continue + assert genout.ndim >= 2 assert genout.shape[0] == koboldai_vars.numseqs + if(koboldai_vars.lua_koboldbridge.generated_cols and koboldai_vars.generated_tkns != koboldai_vars.lua_koboldbridge.generated_cols): raise RuntimeError("Inconsistency detected between KoboldAI Python and Lua backends") + if(already_generated != koboldai_vars.generated_tkns): raise RuntimeError("WI scanning error") + for r in range(koboldai_vars.numseqs): for c in range(already_generated): assert koboldai_vars.lua_koboldbridge.generated[r+1][c+1] is not None genout[r][genout.shape[-1] - already_generated + c] = koboldai_vars.lua_koboldbridge.generated[r+1][c+1] + encoded = [] + + # DYNAMIC WI: + # IF WE FIND WORLD INFO MID-GENERATION, STOP, THEN ADD WI AND ADD NEW GENERATION + for i in range(koboldai_vars.numseqs): txt = utils.decodenewlines(tokenizer.decode(genout[i, -already_generated:])) winfo, mem, anotetxt, _found_entries = calcsubmitbudgetheader(txt, force_use_txt=True, actions=koboldai_vars.actions) @@ 
-5023,6 +5110,7 @@ def _generate(txt, minimum, maximum, found_entries): else: txt, _, _ = calcsubmitbudget(len(koboldai_vars.actions), winfo, mem, anotetxt, koboldai_vars.actions, submission=txt) encoded.append(torch.tensor(txt, dtype=torch.long, device=genout.device)) + max_length = len(max(encoded, key=len)) encoded = torch.stack(tuple(torch.nn.functional.pad(e, (max_length - len(e), 0), value=model.config.pad_token_id or model.config.eos_token_id) for e in encoded)) genout = torch.cat( @@ -5032,6 +5120,7 @@ def _generate(txt, minimum, maximum, found_entries): ), dim=-1 ) + if(koboldai_vars.sp is not None): soft_tokens = torch.arange( model.config.vocab_size, @@ -5049,7 +5138,7 @@ def _generate(txt, minimum, maximum, found_entries): return genout, already_generated -def generate(txt, minimum, maximum, found_entries=None): +def old_generate(txt, minimum, maximum, found_entries=None): koboldai_vars.generated_tkns = 0 if(found_entries is None): From 4d3a80e4a6d96e055548242f7967e30f96a97045 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 19 Sep 2022 21:06:14 -0500 Subject: [PATCH 08/31] Actually just steal from _generate Why I didn't do this before? It's a mystery! --- aiserver.py | 312 +++++++++++++++++++------------------------ koboldai_settings.py | 6 +- 2 files changed, 136 insertions(+), 182 deletions(-) diff --git a/aiserver.py b/aiserver.py index 85767c09..45e4b639 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2032,10 +2032,16 @@ def patch_transformers(): and koboldai_vars.lua_koboldbridge.generated_cols and koboldai_vars.generated_tkns != koboldai_vars.lua_koboldbridge.generated_cols ): - print("[TODO] Fix generated_cols") - # raise RuntimeError(f"Inconsistency detected between KoboldAI Python and Lua backends ({koboldai_vars.generated_tkns} != {koboldai_vars.lua_koboldbridge.generated_cols})") + raise RuntimeError(f"Inconsistency detected between KoboldAI Python and Lua backends ({koboldai_vars.generated_tkns} != {koboldai_vars.lua_koboldbridge.generated_cols})") - if koboldai_vars.abort: + if ( + koboldai_vars.abort + or ( + koboldai_vars.inference_config.stop_at_genamt + and + koboldai_vars.generated_tkns >= koboldai_vars.genamt + ) + ): koboldai_vars.abort = False self.regeneration_required = False self.halt = False @@ -2063,8 +2069,8 @@ def patch_transformers(): tokenizer, excluded_world_info: List[Set], ): - self.regeneration_required = False - self.halt = False + # self.regeneration_required = False + # self.halt = False self.tokenizer = tokenizer self.excluded_world_info = excluded_world_info @@ -2078,27 +2084,9 @@ def patch_transformers(): if not koboldai_vars.inference_config.do_dynamic_wi: return False - # if(koboldai_vars.abort or koboldai_vars.generated_tkns >= koboldai_vars.genamt): - # self.regeneration_required = False - # self.halt = False - # koboldai_vars.abort = False - # return True - - # Pertains to WI I think - # if(koboldai_vars.standalone): - # return False - - # assert input_ids.ndim == 2 assert len(self.excluded_world_info) == input_ids.shape[0] - # self.regeneration_required = koboldai_vars.lua_koboldbridge.regeneration_required - # self.halt = not koboldai_vars.lua_koboldbridge.generating - # koboldai_vars.lua_koboldbridge.regeneration_required = False - - # for i in range(koboldai_vars.numseqs): - # koboldai_vars.lua_koboldbridge.generated[i+1][koboldai_vars.generated_tkns] = int(input_ids[i, -1].item()) if not koboldai_vars.dynamicscan: - #return self.regeneration_required or self.halt return False tail = input_ids[..., 
-koboldai_vars.generated_tkns:] @@ -2107,11 +2095,10 @@ def patch_transformers(): _, found = checkworldinfo(decoded, force_use_txt=True, actions=koboldai_vars.actions) found -= self.excluded_world_info[i] if len(found) != 0: - # self.regeneration_required = True model.core_stopper.regeneration_required = True return True - break - return self.regeneration_required or self.halt + return False + old_get_stopping_criteria = transformers.generation_utils.GenerationMixin._get_stopping_criteria def new_get_stopping_criteria(self, *args, **kwargs): global tokenizer @@ -4789,7 +4776,7 @@ def calcsubmit(txt): "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX" ): - legacy_generate(subtxt, min, max) + generate(subtxt, min, max, found_entries) elif koboldai_vars.model == "Colab": sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) elif koboldai_vars.model == "API": @@ -4857,52 +4844,117 @@ def calcsubmit(txt): # Send it! ikrequest(subtxt) -def legacy_generate(text: Union[str, list], min: int, max: int): - koboldai_vars.lastctx = text +def core_generate(text: list, min: int, max: int, found_entries: set): + # This generation function is tangled with koboldai_vars intentionally. It + # is meant for the story and nothing else. - out_batches = tpool.execute( - raw_generate, - text, - max_length=koboldai_vars.genamt, - do_streaming=True, - batch_count=koboldai_vars.numseqs, - decode=False - ) + if koboldai_vars.full_determinism: + torch.manual_seed(koboldai_vars.seed) - decoded_batches = tokenizer.batch_decode(out_batches) + gen_in = torch.tensor(text, dtype=torch.long)[None] + if koboldai_vars.sp is not None: + soft_tokens = torch.arange( + model.config.vocab_size, + model.config.vocab_size + koboldai_vars.sp.shape[0], + ) + gen_in = torch.cat((soft_tokens[None], gen_in), dim=-1) - # Lua bridge, genmod - for i in range(koboldai_vars.numseqs): - koboldai_vars.lua_koboldbridge.generated[i + 1][koboldai_vars.generated_tkns] = int(out_batches[i, -1].item()) - koboldai_vars.lua_koboldbridge.outputs[i + 1] = utils.decodenewlines(tokenizer.decode(out_batches[i, -len(out_batches[i]):])) + assert gen_in.shape[-1] + koboldai_vars.genamt <= koboldai_vars.max_length - # for i, output in enumerate(outputs): - # koboldai_vars.lua_koboldbridge.outputs[i + 1] = output - - execute_genmod() - - if koboldai_vars.lua_koboldbridge.regeneration_required: - koboldai_vars.lua_koboldbridge.regeneration_required = False - genout = [] - for i in range(len(out_batches)): - out = koboldai_vars.lua_koboldbridge.outputs[i + 1] - genout.append({"generated_text": out}) - assert isinstance(out, str) + if koboldai_vars.hascuda and koboldai_vars.usegpu: + gen_in = gen_in.to(koboldai_vars.gpu_device) + elif koboldai_vars.hascuda and koboldai_vars.breakmodel: + gen_in = gen_in.to(breakmodel.primary_device) else: - genout = [{"generated_text": utils.decodenewlines(x)} for x in decoded_batches] + gen_in = gen_in.to("cpu") - koboldai_vars.actions.append_options([applyoutputformatting(x["generated_text"]) for x in genout]) - genout = [{"generated_text": x['text']} for x in koboldai_vars.actions.get_current_options()] + found_entries = found_entries or set() + model.kai_scanner_excluded_world_info = found_entries - if len(genout) == 1: - genresult(genout[0]["generated_text"]) - else: - restart_seq = koboldai_vars.lua_koboldbridge.restart_sequence - if restart_seq and restart_seq > 0: - genresult(genout[restart_seq - 1]["generated_text"]) - else: - genselect(genout) - set_aibusy(0) + koboldai_vars._prompt = koboldai_vars.prompt 
+ + with torch.no_grad(): + already_generated = 0 + numseqs = koboldai_vars.numseqs + while True: + # The reason this is a loop is due to how Dynamic WI works. We + # cannot simply add the WI to the context mid-generation, so we + # stop early, and then insert WI, then continue generating. That + # stopping and continuing is this loop. + + genout = raw_generate( + gen_in, + # Real max length is handled by CoreStopper. + max_length=int(2e9), + do_streaming=True, + do_dynamic_wi=True, + batch_count=numseqs, + decode=False, + ) + already_generated += len(genout[0]) - len(gen_in[0]) + assert already_generated <= koboldai_vars.genamt + + # Generation stopped; why? + # If we have been told to halt, we have reached our target token + # amount (controlled by halt), or Dynamic WI has not told us to + # stop temporarily to insert WI, we can assume that we are done + # generating. We shall break. + if model.core_stopper.halt or not model.core_stopper.regeneration_required: + break + + # Now we are doing stuff for Dynamic WI. + assert genout.ndim >= 2 + assert genout.shape[0] == koboldai_vars.numseqs + + if(koboldai_vars.lua_koboldbridge.generated_cols and koboldai_vars.generated_tkns != koboldai_vars.lua_koboldbridge.generated_cols): + raise RuntimeError("Inconsistency detected between KoboldAI Python and Lua backends") + + if(already_generated != koboldai_vars.generated_tkns): + raise RuntimeError("WI scanning error") + + for r in range(koboldai_vars.numseqs): + for c in range(already_generated): + assert koboldai_vars.lua_koboldbridge.generated[r+1][c+1] is not None + genout[r][genout.shape[-1] - already_generated + c] = koboldai_vars.lua_koboldbridge.generated[r+1][c+1] + + encoded = [] + + for i in range(koboldai_vars.numseqs): + txt = utils.decodenewlines(tokenizer.decode(genout[i, -already_generated:])) + winfo, mem, anotetxt, _found_entries = calcsubmitbudgetheader(txt, force_use_txt=True, actions=koboldai_vars.actions) + found_entries[i].update(_found_entries) + if koboldai_vars.alt_gen: + txt, _, _ = koboldai_vars.calc_ai_text(submitted_text=txt) + print("Using Alt Gen") + else: + txt, _, _ = calcsubmitbudget(len(koboldai_vars.actions), winfo, mem, anotetxt, koboldai_vars.actions, submission=txt) + encoded.append(torch.tensor(txt, dtype=torch.long, device=genout.device)) + + max_length = len(max(encoded, key=len)) + encoded = torch.stack(tuple(torch.nn.functional.pad(e, (max_length - len(e), 0), value=model.config.pad_token_id or model.config.eos_token_id) for e in encoded)) + genout = torch.cat( + ( + encoded, + genout[..., -already_generated:], + ), + dim=-1 + ) + + if(koboldai_vars.sp is not None): + soft_tokens = torch.arange( + model.config.vocab_size, + model.config.vocab_size + koboldai_vars.sp.shape[0], + device=genout.device, + ) + genout = torch.cat((soft_tokens.tile(koboldai_vars.numseqs, 1), genout), dim=-1) + assert genout.shape[-1] + koboldai_vars.genamt - already_generated <= koboldai_vars.max_length + diff = genout.shape[-1] - gen_in.shape[-1] + minimum += diff + maximum += diff + gen_in = genout + numseqs = 1 + + return genout, already_generated def raw_generate( # prompt is either a string (text) or a list (token ids) @@ -4952,18 +5004,17 @@ def raw_generate( batch_count=batch_count ) - - assert batch_encoded or batch_decoded + assert batch_encoded is not None or batch_decoded is not None # Shave prompt off of encoded response. Decoded does not return prompt. # TODO: Does MTJ generation shave this off automatically? Test it! 
- if batch_encoded: - batch_encoded = batch_encoded[:, len(prompt_tokens):] + if batch_encoded is not None: + batch_encoded = batch_encoded[:, len(prompt_tokens) - 1:] if not decode: return batch_encoded - if not batch_decoded: + if batch_decoded is None: batch_decoded = tokenizer.batch_decode(batch_encoded) return [utils.decodenewlines(x) for x in batch_decoded] @@ -4997,7 +5048,7 @@ def tpu_raw_generate( return genout def torch_raw_generate( - prompt_tokens: List[int], + prompt_tokens: Union[List[int], torch.Tensor], max_new: int, do_streaming: bool = False, @@ -5008,10 +5059,16 @@ def torch_raw_generate( koboldai_vars.inference_config.do_streaming = do_streaming koboldai_vars.inference_config.do_dynamic_wi = do_dynamic_wi - # Makes stopping criteria hook happy - model.kai_scanner_excluded_world_info = [] + # Dynamic WI depends on this!!! This is a main gen call. + koboldai_vars.inference_config.stop_at_genamt = do_dynamic_wi - gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] + # Makes stopping criteria hook happy + model.kai_scanner_excluded_world_info = model.kai_scanner_excluded_world_info or set() + + if not isinstance(prompt_tokens, torch.Tensor): + gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] + else: + gen_in = prompt_tokens device = "cpu" if koboldai_vars.hascuda and koboldai_vars.usegpu: @@ -5037,108 +5094,7 @@ def torch_raw_generate( # Send text to generator and deal with output #==================================================================# -def old_underscore_generate(txt, minimum, maximum, found_entries): - if(koboldai_vars.full_determinism): - torch.manual_seed(koboldai_vars.seed) - - gen_in = torch.tensor(txt, dtype=torch.long)[None] - if(koboldai_vars.sp is not None): - soft_tokens = torch.arange( - model.config.vocab_size, - model.config.vocab_size + koboldai_vars.sp.shape[0], - ) - gen_in = torch.cat((soft_tokens[None], gen_in), dim=-1) - assert gen_in.shape[-1] + koboldai_vars.genamt <= koboldai_vars.max_length - - if(koboldai_vars.hascuda and koboldai_vars.usegpu): - gen_in = gen_in.to(koboldai_vars.gpu_device) - elif(koboldai_vars.hascuda and koboldai_vars.breakmodel): - gen_in = gen_in.to(breakmodel.primary_device) - else: - gen_in = gen_in.to('cpu') - - model.kai_scanner_excluded_world_info = found_entries - - koboldai_vars._prompt = koboldai_vars.prompt - - with torch.no_grad(): - already_generated = 0 - numseqs = koboldai_vars.numseqs - while True: - genout = generator( - gen_in, - do_sample=True, - max_length=int(2e9), - repetition_penalty=1.0, - bad_words_ids=koboldai_vars.badwordsids, - use_cache=True, - num_return_sequences=numseqs - ) - already_generated += len(genout[0]) - len(gen_in[0]) - assert already_generated <= koboldai_vars.genamt - # If we are halting, we stop - if(model.kai_scanner.halt or not model.kai_scanner.regeneration_required): - break - # if we require a generation, we continue - - assert genout.ndim >= 2 - assert genout.shape[0] == koboldai_vars.numseqs - - if(koboldai_vars.lua_koboldbridge.generated_cols and koboldai_vars.generated_tkns != koboldai_vars.lua_koboldbridge.generated_cols): - raise RuntimeError("Inconsistency detected between KoboldAI Python and Lua backends") - - if(already_generated != koboldai_vars.generated_tkns): - raise RuntimeError("WI scanning error") - - for r in range(koboldai_vars.numseqs): - for c in range(already_generated): - assert koboldai_vars.lua_koboldbridge.generated[r+1][c+1] is not None - genout[r][genout.shape[-1] - already_generated + c] = 
koboldai_vars.lua_koboldbridge.generated[r+1][c+1] - - encoded = [] - - # DYNAMIC WI: - # IF WE FIND WORLD INFO MID-GENERATION, STOP, THEN ADD WI AND ADD NEW GENERATION - - for i in range(koboldai_vars.numseqs): - txt = utils.decodenewlines(tokenizer.decode(genout[i, -already_generated:])) - winfo, mem, anotetxt, _found_entries = calcsubmitbudgetheader(txt, force_use_txt=True, actions=koboldai_vars.actions) - found_entries[i].update(_found_entries) - if koboldai_vars.alt_gen: - txt, _, _ = koboldai_vars.calc_ai_text(submitted_text=txt) - print("Using Alt Gen") - else: - txt, _, _ = calcsubmitbudget(len(koboldai_vars.actions), winfo, mem, anotetxt, koboldai_vars.actions, submission=txt) - encoded.append(torch.tensor(txt, dtype=torch.long, device=genout.device)) - - max_length = len(max(encoded, key=len)) - encoded = torch.stack(tuple(torch.nn.functional.pad(e, (max_length - len(e), 0), value=model.config.pad_token_id or model.config.eos_token_id) for e in encoded)) - genout = torch.cat( - ( - encoded, - genout[..., -already_generated:], - ), - dim=-1 - ) - - if(koboldai_vars.sp is not None): - soft_tokens = torch.arange( - model.config.vocab_size, - model.config.vocab_size + koboldai_vars.sp.shape[0], - device=genout.device, - ) - genout = torch.cat((soft_tokens.tile(koboldai_vars.numseqs, 1), genout), dim=-1) - assert genout.shape[-1] + koboldai_vars.genamt - already_generated <= koboldai_vars.max_length - diff = genout.shape[-1] - gen_in.shape[-1] - minimum += diff - maximum += diff - gen_in = genout - numseqs = 1 - - return genout, already_generated - - -def old_generate(txt, minimum, maximum, found_entries=None): +def generate(txt, minimum, maximum, found_entries=None): koboldai_vars.generated_tkns = 0 if(found_entries is None): @@ -5158,7 +5114,7 @@ def old_generate(txt, minimum, maximum, found_entries=None): # Submit input text to generator try: - genout, already_generated = tpool.execute(_generate, txt, minimum, maximum, found_entries) + genout, already_generated = tpool.execute(core_generate, txt, minimum, maximum, found_entries) except Exception as e: if(issubclass(type(e), lupa.LuaError)): koboldai_vars.lua_koboldbridge.obliterate_multiverse() diff --git a/koboldai_settings.py b/koboldai_settings.py index b4553fdd..9aa539e3 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -800,11 +800,9 @@ class system_settings(settings): @dataclass class _inference_config: do_streaming: bool = False - - # NOTE: DynamicWorldInfoScanCriteria handles not only dynamic world - # info, but also max length, aborting, regeneration requests, etc - # for kobold-rooted stuff. This would be nice to change in the future. 
do_dynamic_wi: bool = False + # Genamt stopping is mostly tied to Dynamic WI + stop_at_genamt: bool = False self.inference_config = _inference_config() From d07fe6dcc049dadaca24bb8f5cae8edc6e5e8f53 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 19 Sep 2022 21:50:00 -0500 Subject: [PATCH 09/31] Etc --- aiserver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index 45e4b639..6e1efe27 100644 --- a/aiserver.py +++ b/aiserver.py @@ -2069,8 +2069,6 @@ def patch_transformers(): tokenizer, excluded_world_info: List[Set], ): - # self.regeneration_required = False - # self.halt = False self.tokenizer = tokenizer self.excluded_world_info = excluded_world_info @@ -4968,8 +4966,10 @@ def raw_generate( ) -> List: if isinstance(prompt, str): + prompt_decoded = prompt prompt_tokens = tokenizer.encode(prompt) else: + prompt_decoded = tokenizer.decode(prompt) prompt_tokens = prompt # Some gen methods such as OAI don't return tokens. From cce4369892dc3d52b0286a122cffae74d1c5483b Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 18:52:54 -0500 Subject: [PATCH 10/31] Test --- aiserver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/aiserver.py b/aiserver.py index 6e1efe27..f9157a66 100644 --- a/aiserver.py +++ b/aiserver.py @@ -5,6 +5,7 @@ # By: KoboldAIDev and the KoboldAI Community #==================================================================# +print("CTest") # External packages from dataclasses import dataclass import eventlet From 383e2fc93ca73a15699e4ad21e44108d9cd05529 Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 18:59:19 -0500 Subject: [PATCH 11/31] Fixes to env --- aiserver.py | 1 - requirements_mtj.txt | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index f9157a66..6e1efe27 100644 --- a/aiserver.py +++ b/aiserver.py @@ -5,7 +5,6 @@ # By: KoboldAIDev and the KoboldAI Community #==================================================================# -print("CTest") # External packages from dataclasses import dataclass import eventlet diff --git a/requirements_mtj.txt b/requirements_mtj.txt index 13c797f1..c86bc31d 100644 --- a/requirements_mtj.txt +++ b/requirements_mtj.txt @@ -6,7 +6,7 @@ optax >= 0.0.5, <= 0.0.9 dm-haiku == 0.0.5 jax == 0.2.21 jaxlib >= 0.1.69, <= 0.3.7 -transformers >= 4.19 +transformers ==4.21.3 progressbar2 git+https://github.com/VE-FORBRYDERNE/mesh-transformer-jax@ck flask From 1928a7c9678433c60192d8a0112502d900c2acea Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 19:14:14 -0500 Subject: [PATCH 12/31] Gen 10000 --- aiserver.py | 266 ++++++++++++++++++++++++++-------------------------- 1 file changed, 131 insertions(+), 135 deletions(-) diff --git a/aiserver.py b/aiserver.py index 6e1efe27..8bcdf608 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4874,21 +4874,28 @@ def core_generate(text: list, min: int, max: int, found_entries: set): with torch.no_grad(): already_generated = 0 numseqs = koboldai_vars.numseqs - while True: + + do_loop = True + + while do_loop: # The reason this is a loop is due to how Dynamic WI works. We # cannot simply add the WI to the context mid-generation, so we # stop early, and then insert WI, then continue generating. That # stopping and continuing is this loop. - genout = raw_generate( + result = raw_generate( gen_in, - # Real max length is handled by CoreStopper. 
- max_length=int(2e9), + max_length=koboldai_vars.genamt, do_streaming=True, do_dynamic_wi=True, batch_count=numseqs, - decode=False, + # Real max length is handled by CoreStopper. + bypass_hf_maxlength=True, ) + + do_loop = not result.is_whole_generation + genout = result.encoded + already_generated += len(genout[0]) - len(gen_in[0]) assert already_generated <= koboldai_vars.genamt @@ -4954,6 +4961,25 @@ def core_generate(text: list, min: int, max: int, found_entries: set): return genout, already_generated +class GenerationResult: + def __init__( + self, + out_batches: list, + prompt: list, + + # Controls if generate() does it's looping thing. This should only be + # done for HF models that use that StoppingCondition + is_whole_generation: bool + ): + # Shave prompt off of encoded response. Decoded does not return prompt. + # TODO: Does MTJ generation shave this off automatically? Test it! + print("shape", out_batches.shape) + self.encoded = out_batches[:, len(prompt) - 1:] + self.prompt = prompt + self.is_whole_generation = is_whole_generation + + self.decoded = [utils.decodenewlines(tokenizer.decode(enc)) for enc in self.encoded] + def raw_generate( # prompt is either a string (text) or a list (token ids) prompt: Union[str, list], @@ -4962,28 +4988,17 @@ def raw_generate( do_streaming: bool = False, do_dynamic_wi: bool = False, batch_count: int = 1, - decode: bool = True, -) -> List: + bypass_hf_maxlength: bool = False, +) -> GenerationResult: - if isinstance(prompt, str): - prompt_decoded = prompt - prompt_tokens = tokenizer.encode(prompt) - else: - prompt_decoded = tokenizer.decode(prompt) - prompt_tokens = prompt + prompt_tokens = tokenizer.encode(prompt) if isinstance(prompt, str) else prompt - # Some gen methods such as OAI don't return tokens. - batch_decoded = None - batch_encoded = None - if koboldai_vars.model == "Colab": raise NotImplementedError("Colab API raw_generate unsupported") elif koboldai_vars.model == "API": raise NotImplementedError("API raw_generate unsupported") elif koboldai_vars.model == "CLUSTER": raise NotImplementedError("Cluster raw_generate unsupported") - elif koboldai_vars.model == "OAI": - raise NotImplementedError("OpenAI raw_generate unsupported") elif koboldai_vars.model == "ReadOnly": raise NotImplementedError("No loaded model") @@ -4993,31 +5008,30 @@ def raw_generate( max_length=max_length, batch_count=batch_count ) + return GenerationResult( + out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=False + ) elif model == "OAI": - batch_decoded = ... - else: - batch_encoded = torch_raw_generate( + batch_encoded = oai_raw_generate( prompt_tokens=prompt_tokens, - max_new=max_length, - do_streaming=do_streaming, - do_dynamic_wi=do_dynamic_wi, + max_length=max_length, batch_count=batch_count ) - - assert batch_encoded is not None or batch_decoded is not None - - # Shave prompt off of encoded response. Decoded does not return prompt. - # TODO: Does MTJ generation shave this off automatically? Test it! 
- if batch_encoded is not None: - batch_encoded = batch_encoded[:, len(prompt_tokens) - 1:] + return GenerationResult( + out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=False + ) - if not decode: - return batch_encoded - - if batch_decoded is None: - batch_decoded = tokenizer.batch_decode(batch_encoded) - - return [utils.decodenewlines(x) for x in batch_decoded] + # Torch HF + batch_encoded = torch_raw_generate( + prompt_tokens=prompt_tokens, + max_new=max_length if not bypass_hf_maxlength else int(2e9), + do_streaming=do_streaming, + do_dynamic_wi=do_dynamic_wi, + batch_count=batch_count + ) + return GenerationResult( + out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True + ) def tpu_raw_generate( prompt_tokens: List[int], @@ -5090,6 +5104,84 @@ def torch_raw_generate( return genout +def oai_raw_generate( + prompt_tokens: List[int], + max_length: int, + batch_count: int, +): + # Taken mainly from oairequest() + + decoded_prompt = utils.decodenewlines(tokenizer.decode(prompt_tokens)) + + # Log request to console + if not koboldai_vars.quiet: + print("{0}Len:{1}, Txt:{2}{3}".format(colors.YELLOW, len(decoded_prompt), decoded_prompt, colors.END)) + + # Store context in memory to use it for comparison with generated content + koboldai_vars.lastctx = decoded_prompt + + # Build request JSON data + # GooseAI is a subntype of OAI. So to check if it's this type, we check the configname as a workaround + # as the koboldai_vars.model will always be OAI + if 'GooseAI' in koboldai_vars.configname: + reqdata = { + 'prompt': decoded_prompt, + 'max_tokens': koboldai_vars.genamt, + 'temperature': koboldai_vars.temp, + 'top_a': koboldai_vars.top_a, + 'top_p': koboldai_vars.top_p, + 'top_k': koboldai_vars.top_k, + 'tfs': koboldai_vars.tfs, + 'typical_p': koboldai_vars.typical, + 'repetition_penalty': koboldai_vars.rep_pen, + 'repetition_penalty_slope': koboldai_vars.rep_pen_slope, + 'repetition_penalty_range': koboldai_vars.rep_pen_range, + 'n': koboldai_vars.numseqs, + # TODO: Implement streaming + 'stream': False + } + else: + reqdata = { + 'prompt': decoded_prompt, + 'max_tokens': koboldai_vars.genamt, + 'temperature': koboldai_vars.temp, + 'top_p': koboldai_vars.top_p, + 'n': koboldai_vars.numseqs, + 'stream': False + } + + req = requests.post( + koboldai_vars.oaiurl, + json = reqdata, + headers = { + 'Authorization': 'Bearer '+koboldai_vars.oaiapikey, + 'Content-Type': 'application/json' + } + ) + + # Deal with the response + if(req.status_code == 200): + outputs = [out["text"] for out in req.json()["choices"]] + + decoded_genout = [{"generated_text": utils.decodenewlines(txt)} + for txt in outputs] + + if not koboldai_vars.quiet: + print("{0}{1}{2}".format(colors.CYAN, decoded_genout, colors.END)) + + return [tokenizer.encode(x) for x in decoded_genout] + else: + # Send error message to web client + er = req.json() + if("error" in er): + type = er["error"]["type"] + message = er["error"]["message"] + + errmsg = "OpenAI API Error: {0} - {1}".format(type, message) + emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True, room="UI_1") + set_aibusy(0) + return [] + #==================================================================# # Send text to generator and deal with output #==================================================================# @@ -6323,102 +6415,6 @@ def ikrequest(txt): emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True, room="UI_1") set_aibusy(0) 
-#==================================================================# -# Assembles game data into a request to OpenAI API -#==================================================================# -def oairequest(txt, min, max): - # Log request to console - if not koboldai_vars.quiet: - print("{0}Len:{1}, Txt:{2}{3}".format(colors.YELLOW, len(txt), txt, colors.END)) - - # Store context in memory to use it for comparison with generated content - koboldai_vars.lastctx = txt - - # Build request JSON data - # GooseAI is a subntype of OAI. So to check if it's this type, we check the configname as a workaround - # as the koboldai_vars.model will always be OAI - if 'GooseAI' in koboldai_vars.configname: - reqdata = { - 'prompt': txt, - 'max_tokens': koboldai_vars.genamt, - 'temperature': koboldai_vars.temp, - 'top_a': koboldai_vars.top_a, - 'top_p': koboldai_vars.top_p, - 'top_k': koboldai_vars.top_k, - 'tfs': koboldai_vars.tfs, - 'typical_p': koboldai_vars.typical, - 'repetition_penalty': koboldai_vars.rep_pen, - 'repetition_penalty_slope': koboldai_vars.rep_pen_slope, - 'repetition_penalty_range': koboldai_vars.rep_pen_range, - 'n': koboldai_vars.numseqs, - 'stream': False - } - else: - reqdata = { - 'prompt': txt, - 'max_tokens': koboldai_vars.genamt, - 'temperature': koboldai_vars.temp, - 'top_p': koboldai_vars.top_p, - 'n': koboldai_vars.numseqs, - 'stream': False - } - - req = requests.post( - koboldai_vars.oaiurl, - json = reqdata, - headers = { - 'Authorization': 'Bearer '+koboldai_vars.oaiapikey, - 'Content-Type': 'application/json' - } - ) - - # Deal with the response - if(req.status_code == 200): - outputs = [out["text"] for out in req.json()["choices"]] - - for idx in range(len(outputs)): - koboldai_vars.lua_koboldbridge.outputs[idx+1] = outputs[idx] - - execute_outmod() - if (koboldai_vars.lua_koboldbridge.regeneration_required): - koboldai_vars.lua_koboldbridge.regeneration_required = False - genout = [] - for i in range(len(outputs)): - genout.append( - {"generated_text": koboldai_vars.lua_koboldbridge.outputs[i + 1]}) - assert type(genout[-1]["generated_text"]) is str - else: - genout = [ - {"generated_text": utils.decodenewlines(txt)} - for txt in outputs] - - koboldai_vars.actions.append_options([applyoutputformatting(x["generated_text"]) for x in genout]) - genout = [{"generated_text": x['text']} for x in koboldai_vars.actions.get_current_options()] - if (len(genout) == 1): - genresult(genout[0]["generated_text"]) - else: - if (koboldai_vars.lua_koboldbridge.restart_sequence is not None and - koboldai_vars.lua_koboldbridge.restart_sequence > 0): - genresult(genout[koboldai_vars.lua_koboldbridge.restart_sequence - 1][ - "generated_text"]) - else: - genselect(genout) - - if not koboldai_vars.quiet: - print("{0}{1}{2}".format(colors.CYAN, genout, colors.END)) - - set_aibusy(0) - else: - # Send error message to web client - er = req.json() - if("error" in er): - type = er["error"]["type"] - message = er["error"]["message"] - - errmsg = "OpenAI API Error: {0} - {1}".format(type, message) - emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True, room="UI_1") - set_aibusy(0) - #==================================================================# # Forces UI to Play mode #==================================================================# From de6ea04e12de1104ffbf3a26a70b6c44852d05f4 Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 19:14:35 -0500 Subject: [PATCH 13/31] Push it on raw_generate! 
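The intent here, made explicit by the last patches in this series, is to funnel
every backend through generate() -> raw_generate() instead of the per-backend
branches in calcsubmit(). Roughly the dispatch raw_generate() converges on
(sketch only, argument lists trimmed):

    model_functions = {
        "GooseAI": oai_raw_generate,
        "OAI": oai_raw_generate,
        "CLUSTER": cluster_raw_generate,
        "Colab": colab_raw_generate,
        "API": api_raw_generate,
    }

    if koboldai_vars.use_colab_tpu or koboldai_vars.model in (
        "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"
    ):
        batch_encoded = tpu_raw_generate(...)
    elif koboldai_vars.model in model_functions:
        batch_encoded = model_functions[koboldai_vars.model](...)
    else:
        batch_encoded = torch_raw_generate(...)  # local HF model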
--- aiserver.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/aiserver.py b/aiserver.py index 8bcdf608..7d498431 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4770,9 +4770,8 @@ def calcsubmit(txt): "Colab", "API", "CLUSTER", - "OAI", - "TPUMeshTransformerGPTJ", - "TPUMeshTransformerGPTNeoX" + # "TPUMeshTransformerGPTJ", + # "TPUMeshTransformerGPTNeoX" ): generate(subtxt, min, max, found_entries) elif koboldai_vars.model == "Colab": @@ -4781,10 +4780,8 @@ def calcsubmit(txt): sendtoapi(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) elif koboldai_vars.model == "CLUSTER": sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - elif koboldai_vars.model == "OAI": - oairequest(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - elif koboldai_vars.use_colab_tpu or koboldai_vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): - tpumtjgenerate(subtxt, min, max, found_entries=found_entries) + # elif koboldai_vars.use_colab_tpu or koboldai_vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): + # tpumtjgenerate(subtxt, min, max, found_entries=found_entries) # For InferKit web API else: From 3ddf6170f65963903bfddec95cbc21a4cc05ef54 Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 19:28:00 -0500 Subject: [PATCH 14/31] Finally some debugging --- aiserver.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 7d498431..a9383a1b 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4839,6 +4839,9 @@ def calcsubmit(txt): # Send it! ikrequest(subtxt) +def __debug(text): + print(f"[DBG] {text}") + def core_generate(text: list, min: int, max: int, found_entries: set): # This generation function is tangled with koboldai_vars intentionally. It # is meant for the story and nothing else. @@ -4868,6 +4871,7 @@ def core_generate(text: list, min: int, max: int, found_entries: set): koboldai_vars._prompt = koboldai_vars.prompt + __debug("generate core", text) with torch.no_grad(): already_generated = 0 numseqs = koboldai_vars.numseqs @@ -4875,6 +4879,7 @@ def core_generate(text: list, min: int, max: int, found_entries: set): do_loop = True while do_loop: + __debug("generate loop start", text) # The reason this is a loop is due to how Dynamic WI works. We # cannot simply add the WI to the context mid-generation, so we # stop early, and then insert WI, then continue generating. 
That @@ -4890,7 +4895,9 @@ def core_generate(text: list, min: int, max: int, found_entries: set): bypass_hf_maxlength=True, ) + __debug("generate result", result.__dict__) do_loop = not result.is_whole_generation + __debug("loop is", do_loop) genout = result.encoded already_generated += len(genout[0]) - len(gen_in[0]) @@ -5006,7 +5013,7 @@ def raw_generate( batch_count=batch_count ) return GenerationResult( - out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=False + out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True ) elif model == "OAI": batch_encoded = oai_raw_generate( @@ -5015,7 +5022,7 @@ def raw_generate( batch_count=batch_count ) return GenerationResult( - out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=False + out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True ) # Torch HF @@ -5027,7 +5034,7 @@ def raw_generate( batch_count=batch_count ) return GenerationResult( - out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True + out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=False ) def tpu_raw_generate( From 5c546b323321a68fcab07a3b035fda6f611bd831 Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 19:38:20 -0500 Subject: [PATCH 15/31] More debug --- aiserver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/aiserver.py b/aiserver.py index a9383a1b..78b3e0a8 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4782,6 +4782,8 @@ def calcsubmit(txt): sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) # elif koboldai_vars.use_colab_tpu or koboldai_vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): # tpumtjgenerate(subtxt, min, max, found_entries=found_entries) + else: + print(":(", koboldai_vars.model) # For InferKit web API else: @@ -5043,6 +5045,7 @@ def tpu_raw_generate( batch_count: int, ): # Mostly lifted from apiactionsubmit_tpumtjgenerate + print("we are generating") soft_tokens = tpumtjgetsofttokens() genout = tpool.execute( tpu_mtj_backend.infer_static, @@ -5191,6 +5194,7 @@ def oai_raw_generate( #==================================================================# def generate(txt, minimum, maximum, found_entries=None): + print("ring ring", txt, minimum, maximum, found_entries) koboldai_vars.generated_tkns = 0 if(found_entries is None): From ae90c39f7212c6c4d8900ce8105c4c6fba8e5fb4 Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 19:47:21 -0500 Subject: [PATCH 16/31] yep fix --- aiserver.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/aiserver.py b/aiserver.py index 78b3e0a8..3e6220ab 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4766,24 +4766,24 @@ def calcsubmit(txt): else: subtxt, min, max = calcsubmitbudget(actionlen, winfo, mem, anotetxt, koboldai_vars.actions, submission=txt) - if not koboldai_vars.use_colab_tpu and koboldai_vars.model not in ( - "Colab", - "API", - "CLUSTER", - # "TPUMeshTransformerGPTJ", - # "TPUMeshTransformerGPTNeoX" - ): - generate(subtxt, min, max, found_entries) - elif koboldai_vars.model == "Colab": - sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - elif koboldai_vars.model == "API": - sendtoapi(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - elif koboldai_vars.model == "CLUSTER": - sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) + # if koboldai_vars.model not in ( + # "Colab", + # "API", + # "CLUSTER", + # # "TPUMeshTransformerGPTJ", + # # 
"TPUMeshTransformerGPTNeoX" + # ): + generate(subtxt, min, max, found_entries) + # elif koboldai_vars.model == "Colab": + # sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) + # elif koboldai_vars.model == "API": + # sendtoapi(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) + # elif koboldai_vars.model == "CLUSTER": + # sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) # elif koboldai_vars.use_colab_tpu or koboldai_vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): # tpumtjgenerate(subtxt, min, max, found_entries=found_entries) - else: - print(":(", koboldai_vars.model) + # else: + # print(":(", koboldai_vars.model) # For InferKit web API else: From d9d24902aef2480aa6c4dd2d58d0d5a77b48db73 Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 20:10:23 -0500 Subject: [PATCH 17/31] Work on mtj --- aiserver.py | 24 +++++++++++++----------- koboldai_settings.py | 11 ++++++++++- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/aiserver.py b/aiserver.py index 3e6220ab..95d92bec 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4841,23 +4841,25 @@ def calcsubmit(txt): # Send it! ikrequest(subtxt) -def __debug(text): - print(f"[DBG] {text}") +def __debug(*args): + print("[DBG] ", *args) def core_generate(text: list, min: int, max: int, found_entries: set): # This generation function is tangled with koboldai_vars intentionally. It # is meant for the story and nothing else. - if koboldai_vars.full_determinism: - torch.manual_seed(koboldai_vars.seed) + if koboldai_vars.is_model_torch(): + # Torch stuff + if koboldai_vars.full_determinism: + torch.manual_seed(koboldai_vars.seed) - gen_in = torch.tensor(text, dtype=torch.long)[None] - if koboldai_vars.sp is not None: - soft_tokens = torch.arange( - model.config.vocab_size, - model.config.vocab_size + koboldai_vars.sp.shape[0], - ) - gen_in = torch.cat((soft_tokens[None], gen_in), dim=-1) + gen_in = torch.tensor(text, dtype=torch.long)[None] + if koboldai_vars.sp is not None: + soft_tokens = torch.arange( + model.config.vocab_size, + model.config.vocab_size + koboldai_vars.sp.shape[0], + ) + gen_in = torch.cat((soft_tokens[None], gen_in), dim=-1) assert gen_in.shape[-1] + koboldai_vars.genamt <= koboldai_vars.max_length diff --git a/koboldai_settings.py b/koboldai_settings.py index 9aa539e3..2279b532 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -287,6 +287,15 @@ class koboldai_vars(object): self.context = context return tokens, used_tokens, used_tokens+self.genamt + + def is_model_torch(self) -> bool: + if self.use_colab_tpu: + return False + + if self.model in ["Colab", "API", "CLUSTER", "ReadOnly", "OAI"]: + return False + + return True def __setattr__(self, name, value): if name[0] == "_" or name == "tokenizer": @@ -443,7 +452,7 @@ class model_settings(settings): self.uid_presets = [] self.default_preset = {} self.cluster_requested_models = [] # The models which we allow to generate during cluster mode - + #dummy class to eat the tqdm output class ignore_tqdm(object): From 39f7dec9accbbdcddf5a68406710ab12b95bacee Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 20:18:06 -0500 Subject: [PATCH 18/31] Oops --- aiserver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 95d92bec..3f12407b 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4848,12 +4848,13 @@ def core_generate(text: list, min: int, max: int, found_entries: set): # This generation function is tangled with koboldai_vars 
intentionally. It # is meant for the story and nothing else. + gen_in = torch.tensor(text, dtype=torch.long)[None] + if koboldai_vars.is_model_torch(): # Torch stuff if koboldai_vars.full_determinism: torch.manual_seed(koboldai_vars.seed) - gen_in = torch.tensor(text, dtype=torch.long)[None] if koboldai_vars.sp is not None: soft_tokens = torch.arange( model.config.vocab_size, From 9d708611187c7e1eb0c333a742775c376d571512 Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 20:23:31 -0500 Subject: [PATCH 19/31] TPU Fixes --- aiserver.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/aiserver.py b/aiserver.py index 3f12407b..8acacc07 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4872,7 +4872,9 @@ def core_generate(text: list, min: int, max: int, found_entries: set): gen_in = gen_in.to("cpu") found_entries = found_entries or set() - model.kai_scanner_excluded_world_info = found_entries + + if model: + model.kai_scanner_excluded_world_info = found_entries koboldai_vars._prompt = koboldai_vars.prompt @@ -4881,9 +4883,7 @@ def core_generate(text: list, min: int, max: int, found_entries: set): already_generated = 0 numseqs = koboldai_vars.numseqs - do_loop = True - - while do_loop: + while True: __debug("generate loop start", text) # The reason this is a loop is due to how Dynamic WI works. We # cannot simply add the WI to the context mid-generation, so we @@ -4901,8 +4901,11 @@ def core_generate(text: list, min: int, max: int, found_entries: set): ) __debug("generate result", result.__dict__) - do_loop = not result.is_whole_generation - __debug("loop is", do_loop) + + if result.is_whole_generation: + __debug("Outa here") + break + genout = result.encoded already_generated += len(genout[0]) - len(gen_in[0]) From 5e9f3b3c0ebd4b32c02115ad074e27a54b360d96 Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 20:30:26 -0500 Subject: [PATCH 20/31] XtraDBug --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 8acacc07..6ba95899 100644 --- a/aiserver.py +++ b/aiserver.py @@ -5051,7 +5051,7 @@ def tpu_raw_generate( batch_count: int, ): # Mostly lifted from apiactionsubmit_tpumtjgenerate - print("we are generating") + print("we are generating with", prompt_tokens, "batch", batch_count) soft_tokens = tpumtjgetsofttokens() genout = tpool.execute( tpu_mtj_backend.infer_static, From c664a0ae1e4a386b0d861e095eb6376d2e667f1e Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 20:43:10 -0500 Subject: [PATCH 21/31] Figure out what is going on --- aiserver.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index 6ba95899..7d9d8a17 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4442,7 +4442,7 @@ def apiactionsubmit_generate(txt, minimum, maximum): torch.cuda.empty_cache() # Submit input text to generator - _genout, already_generated = tpool.execute(_generate, txt, minimum, maximum, set()) + _genout, already_generated = tpool.execute(core_generate, txt, minimum, maximum, set()) genout = [applyoutputformatting(utils.decodenewlines(tokenizer.decode(tokens[-already_generated:]))) for tokens in _genout] @@ -4861,6 +4861,9 @@ def core_generate(text: list, min: int, max: int, found_entries: set): model.config.vocab_size + koboldai_vars.sp.shape[0], ) gen_in = torch.cat((soft_tokens[None], gen_in), dim=-1) + elif koboldai_vars.use_colab_tpu: + if koboldai_vars.full_determinism: + tpu_mtj_backend.set_rng_seed(koboldai_vars.seed) assert 
gen_in.shape[-1] + koboldai_vars.genamt <= koboldai_vars.max_length @@ -5051,8 +5054,8 @@ def tpu_raw_generate( batch_count: int, ): # Mostly lifted from apiactionsubmit_tpumtjgenerate - print("we are generating with", prompt_tokens, "batch", batch_count) soft_tokens = tpumtjgetsofttokens() + __debug("we are generating with", prompt_tokens, "batch", batch_count, "soft tokens", soft_tokens) genout = tpool.execute( tpu_mtj_backend.infer_static, np.uint32(prompt_tokens), From 1d32b090a9388feedc357d702d19b93deea54e01 Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 21:08:36 -0500 Subject: [PATCH 22/31] MTJ Gen Work --- aiserver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aiserver.py b/aiserver.py index 7d9d8a17..e79c0272 100644 --- a/aiserver.py +++ b/aiserver.py @@ -5053,6 +5053,8 @@ def tpu_raw_generate( max_length: int, batch_count: int, ): + + prompt_tokens = prompt_tokens[0] # Mostly lifted from apiactionsubmit_tpumtjgenerate soft_tokens = tpumtjgetsofttokens() __debug("we are generating with", prompt_tokens, "batch", batch_count, "soft tokens", soft_tokens) From 92e240a6ba45e9ba3169be74ccde5ecb838f82db Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 21:31:56 -0500 Subject: [PATCH 23/31] MTJ Fix --- aiserver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aiserver.py b/aiserver.py index e79c0272..91600cb9 100644 --- a/aiserver.py +++ b/aiserver.py @@ -5077,6 +5077,8 @@ def tpu_raw_generate( sampler_order=koboldai_vars.sampler_order, ) + genout = np.array(genout) + return genout def torch_raw_generate( From ffbe50920e80dabee85cc94924eeb0df9759ea40 Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 21:34:54 -0500 Subject: [PATCH 24/31] MTJ Fix --- aiserver.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/aiserver.py b/aiserver.py index 91600cb9..cafc5f9f 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4905,15 +4905,15 @@ def core_generate(text: list, min: int, max: int, found_entries: set): __debug("generate result", result.__dict__) - if result.is_whole_generation: - __debug("Outa here") - break - genout = result.encoded already_generated += len(genout[0]) - len(gen_in[0]) assert already_generated <= koboldai_vars.genamt + if result.is_whole_generation: + __debug("Outa here") + break + # Generation stopped; why? 
# If we have been told to halt, we have reached our target token # amount (controlled by halt), or Dynamic WI has not told us to @@ -5058,6 +5058,7 @@ def tpu_raw_generate( # Mostly lifted from apiactionsubmit_tpumtjgenerate soft_tokens = tpumtjgetsofttokens() __debug("we are generating with", prompt_tokens, "batch", batch_count, "soft tokens", soft_tokens) + genout = tpool.execute( tpu_mtj_backend.infer_static, np.uint32(prompt_tokens), From ca356d4d6fc3c4ac171d6beba84d3dad76bfe3c2 Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 21:57:01 -0500 Subject: [PATCH 25/31] MTJ Fix for trim --- aiserver.py | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/aiserver.py b/aiserver.py index cafc5f9f..90575eac 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4766,24 +4766,7 @@ def calcsubmit(txt): else: subtxt, min, max = calcsubmitbudget(actionlen, winfo, mem, anotetxt, koboldai_vars.actions, submission=txt) - # if koboldai_vars.model not in ( - # "Colab", - # "API", - # "CLUSTER", - # # "TPUMeshTransformerGPTJ", - # # "TPUMeshTransformerGPTNeoX" - # ): generate(subtxt, min, max, found_entries) - # elif koboldai_vars.model == "Colab": - # sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - # elif koboldai_vars.model == "API": - # sendtoapi(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - # elif koboldai_vars.model == "CLUSTER": - # sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) - # elif koboldai_vars.use_colab_tpu or koboldai_vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): - # tpumtjgenerate(subtxt, min, max, found_entries=found_entries) - # else: - # print(":(", koboldai_vars.model) # For InferKit web API else: @@ -4984,12 +4967,20 @@ class GenerationResult: # Controls if generate() does it's looping thing. This should only be # done for HF models that use that StoppingCondition - is_whole_generation: bool + is_whole_generation: bool, + + # Controls if we should trim output by prompt length + output_includes_prompt: bool = False, ): # Shave prompt off of encoded response. Decoded does not return prompt. # TODO: Does MTJ generation shave this off automatically? Test it! - print("shape", out_batches.shape) - self.encoded = out_batches[:, len(prompt) - 1:] + __debug("shape", out_batches.shape) + + if output_includes_prompt: + self.encoded = out_batches[:, len(prompt) - 1:] + else: + self.encoded = out_batches + self.prompt = prompt self.is_whole_generation = is_whole_generation @@ -5042,10 +5033,13 @@ def raw_generate( max_new=max_length if not bypass_hf_maxlength else int(2e9), do_streaming=do_streaming, do_dynamic_wi=do_dynamic_wi, - batch_count=batch_count + batch_count=batch_count, ) return GenerationResult( - out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=False + out_batches=batch_encoded, + prompt=prompt_tokens, + is_whole_generation=False, + output_includes_prompt=True, ) def tpu_raw_generate( From 0b4ce13eb86e8fddbfc725f008ab96343db6245f Mon Sep 17 00:00:00 2001 From: somebody Date: Thu, 22 Sep 2022 22:05:48 -0500 Subject: [PATCH 26/31] Evil --- aiserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiserver.py b/aiserver.py index 90575eac..744a0058 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4974,7 +4974,7 @@ class GenerationResult: ): # Shave prompt off of encoded response. Decoded does not return prompt. # TODO: Does MTJ generation shave this off automatically? Test it! 
- __debug("shape", out_batches.shape) + print("shape", out_batches.shape) if output_includes_prompt: self.encoded = out_batches[:, len(prompt) - 1:] From 5c374dff316247890c72cf38959ec3267db157bd Mon Sep 17 00:00:00 2001 From: somebody Date: Fri, 23 Sep 2022 19:51:40 -0500 Subject: [PATCH 27/31] Finishing up OAI/Goose --- aiserver.py | 98 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 40 deletions(-) diff --git a/aiserver.py b/aiserver.py index 744a0058..7c349c5c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -43,6 +43,7 @@ import inspect import warnings import multiprocessing import copy +import numpy as np from collections.abc import Iterable from collections import OrderedDict from typing import Any, Callable, TypeVar, Tuple, Union, Dict, Set, List, Optional, Type @@ -1141,10 +1142,6 @@ def spRequest(filename): koboldai_vars.sp_changed = True return - global np - if 'np' not in globals(): - import numpy as np - z, version, shape, fortran_order, dtype = fileops.checksp("./softprompts/"+filename, koboldai_vars.modeldim) if not isinstance(z, zipfile.ZipFile): raise RuntimeError(f"{repr(filename)} is not a valid soft prompt file") @@ -1342,9 +1339,6 @@ def general_startup(override_args=None): def tpumtjgetsofttokens(): soft_tokens = None if(koboldai_vars.sp is None): - global np - if 'np' not in globals(): - import numpy as np tensor = np.zeros((1, tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"])), dtype=np.float32) rows = tensor.shape[0] padding_amount = tpu_mtj_backend.params["seq"] - (tpu_mtj_backend.params["seq"] % -tpu_mtj_backend.params["cores_per_replica"]) - rows @@ -1406,14 +1400,19 @@ def get_model_info(model, directory=""): if path.exists("settings/{}.v2_settings".format(model)): with open("settings/{}.v2_settings".format(model), "r") as file: # Check if API key exists - js = json.load(file) - if("apikey" in js and js["apikey"] != ""): - # API key exists, grab it and close the file - key_value = js["apikey"] - elif 'oaiapikey' in js and js['oaiapikey'] != "": - key_value = js["oaiapikey"] - if model in ('GooseAI', 'OAI'): - get_oai_models({'model': model, 'key': key_value}) + try: + js = json.load(file) + + if("apikey" in js and js["apikey"] != ""): + # API key exists, grab it and close the file + key_value = js["apikey"] + elif 'oaiapikey' in js and js['oaiapikey'] != "": + key_value = js["oaiapikey"] + if model in ('GooseAI', 'OAI'): + get_oai_models({'model': model, 'key': key_value}) + except json.decoder.JSONDecodeError: + print(":(") + pass key = True elif model == 'ReadOnly': pass @@ -1500,7 +1499,8 @@ def get_oai_models(data): } ) if(req.status_code == 200): - engines = req.json()["data"] + r = req.json() + engines = r["data"] try: engines = [[en["id"], "{} ({})".format(en['id'], "Ready" if en["ready"] == True else "Not Ready")] for en in engines] except: @@ -1524,7 +1524,9 @@ def get_oai_models(data): if js['apikey'] != key: changed=True else: + js = {} changed=True + if changed: with open("settings/{}.v2_settings".format(model), "w") as file: js["apikey"] = key @@ -4877,7 +4879,7 @@ def core_generate(text: list, min: int, max: int, found_entries: set): # stopping and continuing is this loop. 
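# The loop described above, reduced to a skeleton (a hedged paraphrase of
# core_generate with simplified names, not the literal implementation):
#
#     while True:
#         result = raw_generate(context, ...)   # HF path can stop early on a WI trigger
#         already_generated += <number of new tokens>
#         if result.is_whole_generation:        # non-HF backends return everything at once
#             break
#         if <halted, genamt reached, or no new WI entry was triggered>:
#             break
#         context = <context rebuilt to include the triggered WI entries>
#         # ...and the next iteration resumes generating from the stop point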
result = raw_generate( - gen_in, + gen_in[0], max_length=koboldai_vars.genamt, do_streaming=True, do_dynamic_wi=True, @@ -4890,7 +4892,7 @@ def core_generate(text: list, min: int, max: int, found_entries: set): genout = result.encoded - already_generated += len(genout[0]) - len(gen_in[0]) + already_generated += len(genout[0]) # - len(gen_in[0]) assert already_generated <= koboldai_vars.genamt if result.is_whole_generation: @@ -4951,12 +4953,13 @@ def core_generate(text: list, min: int, max: int, found_entries: set): ) genout = torch.cat((soft_tokens.tile(koboldai_vars.numseqs, 1), genout), dim=-1) assert genout.shape[-1] + koboldai_vars.genamt - already_generated <= koboldai_vars.max_length - diff = genout.shape[-1] - gen_in.shape[-1] - minimum += diff - maximum += diff + # diff = genout.shape[-1] - gen_in.shape[-1] + # minimum += diff + # maximum += diff gen_in = genout numseqs = 1 + __debug("final out", genout, "already_gen", already_generated) return genout, already_generated class GenerationResult: @@ -4988,7 +4991,7 @@ class GenerationResult: def raw_generate( # prompt is either a string (text) or a list (token ids) - prompt: Union[str, list], + prompt: Union[str, list, np.ndarray], max_length: int, do_streaming: bool = False, @@ -4997,7 +5000,18 @@ def raw_generate( bypass_hf_maxlength: bool = False, ) -> GenerationResult: - prompt_tokens = tokenizer.encode(prompt) if isinstance(prompt, str) else prompt + if isinstance(prompt, torch.Tensor): + prompt_tokens = prompt.cpu().numpy() + elif isinstance(prompt, list): + prompt_tokens = np.array(prompt) + elif isinstance(prompt, str): + prompt_tokens = tokenizer.encode(prompt) + else: + raise ValueError(f"Prompt is {type(prompt)}. Not a fan!") + + assert isinstance(prompt_tokens, np.ndarray) + assert len(prompt_tokens.shape) == 1 + if koboldai_vars.model == "Colab": raise NotImplementedError("Colab API raw_generate unsupported") @@ -5008,7 +5022,7 @@ def raw_generate( elif koboldai_vars.model == "ReadOnly": raise NotImplementedError("No loaded model") - if koboldai_vars.use_colab_tpu or model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): + if koboldai_vars.use_colab_tpu or koboldai_vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): batch_encoded = tpu_raw_generate( prompt_tokens=prompt_tokens, max_length=max_length, @@ -5017,7 +5031,8 @@ def raw_generate( return GenerationResult( out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True ) - elif model == "OAI": + elif koboldai_vars.model in ["GooseAI", "OAI"]: + print("kiss") batch_encoded = oai_raw_generate( prompt_tokens=prompt_tokens, max_length=max_length, @@ -5026,6 +5041,8 @@ def raw_generate( return GenerationResult( out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True ) + + print("model", model) # Torch HF batch_encoded = torch_raw_generate( @@ -5048,7 +5065,6 @@ def tpu_raw_generate( batch_count: int, ): - prompt_tokens = prompt_tokens[0] # Mostly lifted from apiactionsubmit_tpumtjgenerate soft_tokens = tpumtjgetsofttokens() __debug("we are generating with", prompt_tokens, "batch", batch_count, "soft tokens", soft_tokens) @@ -5174,26 +5190,28 @@ def oai_raw_generate( } ) + j = req.json() # Deal with the response - if(req.status_code == 200): - outputs = [out["text"] for out in req.json()["choices"]] - - decoded_genout = [{"generated_text": utils.decodenewlines(txt)} - for txt in outputs] + if req.ok: + outputs = [out["text"] for out in j["choices"]] if not koboldai_vars.quiet: - 
print("{0}{1}{2}".format(colors.CYAN, decoded_genout, colors.END)) + print("{0}{1}{2}".format(colors.CYAN, outputs, colors.END)) - return [tokenizer.encode(x) for x in decoded_genout] + return np.array([tokenizer.encode(x) for x in outputs]) else: - # Send error message to web client - er = req.json() - if("error" in er): - type = er["error"]["type"] - message = er["error"]["message"] + # Send error message to web client + if "error" in j: + error_type = j["error"]["type"] + error_message = j["error"]["message"] + else: + error_type = "Unknown" + error_message = "Unknown" - errmsg = "OpenAI API Error: {0} - {1}".format(type, message) - emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True, room="UI_1") + emit('from_server', { + 'cmd': 'errmsg', + 'data': f"OpenAI API Error: {error_type} - {error_message}" + }, broadcast=True, room="UI_1") set_aibusy(0) return [] From 3a727bc381d5311a38e00ce7b1db487dd70509ad Mon Sep 17 00:00:00 2001 From: somebody Date: Fri, 23 Sep 2022 21:47:24 -0500 Subject: [PATCH 28/31] Fixes and fixes --- aiserver.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/aiserver.py b/aiserver.py index 7c349c5c..e3d2897d 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4892,7 +4892,7 @@ def core_generate(text: list, min: int, max: int, found_entries: set): genout = result.encoded - already_generated += len(genout[0]) # - len(gen_in[0]) + already_generated += len(genout[0]) - 1 # - len(gen_in[0]) assert already_generated <= koboldai_vars.genamt if result.is_whole_generation: @@ -5041,8 +5041,6 @@ def raw_generate( return GenerationResult( out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True ) - - print("model", model) # Torch HF batch_encoded = torch_raw_generate( From 5cdeb79752c40b878c9c1b38a13f6cdb35901a30 Mon Sep 17 00:00:00 2001 From: somebody Date: Sat, 24 Sep 2022 12:54:20 -0500 Subject: [PATCH 29/31] Final touches --- aiserver.py | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/aiserver.py b/aiserver.py index e3d2897d..628a0d17 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4826,9 +4826,6 @@ def calcsubmit(txt): # Send it! ikrequest(subtxt) -def __debug(*args): - print("[DBG] ", *args) - def core_generate(text: list, min: int, max: int, found_entries: set): # This generation function is tangled with koboldai_vars intentionally. It # is meant for the story and nothing else. @@ -4866,13 +4863,11 @@ def core_generate(text: list, min: int, max: int, found_entries: set): koboldai_vars._prompt = koboldai_vars.prompt - __debug("generate core", text) with torch.no_grad(): already_generated = 0 numseqs = koboldai_vars.numseqs while True: - __debug("generate loop start", text) # The reason this is a loop is due to how Dynamic WI works. We # cannot simply add the WI to the context mid-generation, so we # stop early, and then insert WI, then continue generating. That @@ -4888,15 +4883,12 @@ def core_generate(text: list, min: int, max: int, found_entries: set): bypass_hf_maxlength=True, ) - __debug("generate result", result.__dict__) - genout = result.encoded - already_generated += len(genout[0]) - 1 # - len(gen_in[0]) + already_generated += len(genout[0]) - 1 assert already_generated <= koboldai_vars.genamt if result.is_whole_generation: - __debug("Outa here") break # Generation stopped; why? 
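# Aside on the accounting a few lines up: on the HF path GenerationResult.encoded
# is sliced at len(prompt) - 1, so each batch row still carries one prompt token;
# the `- 1` in `already_generated += len(genout[0]) - 1` compensates for that,
# leaving already_generated equal to the number of newly generated tokens.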
@@ -4953,13 +4945,9 @@ def core_generate(text: list, min: int, max: int, found_entries: set): ) genout = torch.cat((soft_tokens.tile(koboldai_vars.numseqs, 1), genout), dim=-1) assert genout.shape[-1] + koboldai_vars.genamt - already_generated <= koboldai_vars.max_length - # diff = genout.shape[-1] - gen_in.shape[-1] - # minimum += diff - # maximum += diff gen_in = genout numseqs = 1 - __debug("final out", genout, "already_gen", already_generated) return genout, already_generated class GenerationResult: @@ -4975,10 +4963,8 @@ class GenerationResult: # Controls if we should trim output by prompt length output_includes_prompt: bool = False, ): - # Shave prompt off of encoded response. Decoded does not return prompt. - # TODO: Does MTJ generation shave this off automatically? Test it! - print("shape", out_batches.shape) - + # Shave prompt off of encoded response when needed (HF). Decoded does + # not return prompt. if output_includes_prompt: self.encoded = out_batches[:, len(prompt) - 1:] else: @@ -5065,7 +5051,6 @@ def tpu_raw_generate( # Mostly lifted from apiactionsubmit_tpumtjgenerate soft_tokens = tpumtjgetsofttokens() - __debug("we are generating with", prompt_tokens, "batch", batch_count, "soft tokens", soft_tokens) genout = tpool.execute( tpu_mtj_backend.infer_static, From ba3101a2881d913f9a02e8f245d3954abe8de061 Mon Sep 17 00:00:00 2001 From: somebody Date: Sat, 24 Sep 2022 15:47:12 -0500 Subject: [PATCH 30/31] Actual final touches --- aiserver.py | 558 +++++++++++++++++++++++----------------------------- 1 file changed, 245 insertions(+), 313 deletions(-) diff --git a/aiserver.py b/aiserver.py index 628a0d17..b81c3ed7 100644 --- a/aiserver.py +++ b/aiserver.py @@ -4875,7 +4875,7 @@ def core_generate(text: list, min: int, max: int, found_entries: set): result = raw_generate( gen_in[0], - max_length=koboldai_vars.genamt, + max_new=koboldai_vars.genamt, do_streaming=True, do_dynamic_wi=True, batch_count=numseqs, @@ -4975,17 +4975,49 @@ class GenerationResult: self.decoded = [utils.decodenewlines(tokenizer.decode(enc)) for enc in self.encoded] +class GenerationSettings: + def __init__(self, **overrides) -> None: + for setting in [ + "temp", + "top_p", + "top_k", + "tfs", + "typical", + "top_a", + "rep_pen", + "rep_pen_slope", + "rep_pen_range", + "sampler_order", + ]: + setattr( + self, + setting, + overrides.get(setting, getattr(koboldai_vars, setting)) + ) + + def raw_generate( # prompt is either a string (text) or a list (token ids) prompt: Union[str, list, np.ndarray], - max_length: int, + max_new: int, do_streaming: bool = False, do_dynamic_wi: bool = False, batch_count: int = 1, bypass_hf_maxlength: bool = False, + generation_settings: Optional[dict] = None ) -> GenerationResult: + gen_settings = GenerationSettings(*(generation_settings or {})) + + model_functions = { + "GooseAI": oai_raw_generate, + "OAI": oai_raw_generate, + "CLUSTER": cluster_raw_generate, + "Colab": colab_raw_generate, + "API": api_raw_generate, + } + if isinstance(prompt, torch.Tensor): prompt_tokens = prompt.cpu().numpy() elif isinstance(prompt, list): @@ -4998,31 +5030,25 @@ def raw_generate( assert isinstance(prompt_tokens, np.ndarray) assert len(prompt_tokens.shape) == 1 - - if koboldai_vars.model == "Colab": - raise NotImplementedError("Colab API raw_generate unsupported") - elif koboldai_vars.model == "API": - raise NotImplementedError("API raw_generate unsupported") - elif koboldai_vars.model == "CLUSTER": - raise NotImplementedError("Cluster raw_generate unsupported") - elif koboldai_vars.model == 
"ReadOnly": + if koboldai_vars.model == "ReadOnly": raise NotImplementedError("No loaded model") if koboldai_vars.use_colab_tpu or koboldai_vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"): batch_encoded = tpu_raw_generate( prompt_tokens=prompt_tokens, - max_length=max_length, - batch_count=batch_count + max_new=max_new, + batch_count=batch_count, + gen_settings=gen_settings ) return GenerationResult( out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True ) - elif koboldai_vars.model in ["GooseAI", "OAI"]: - print("kiss") - batch_encoded = oai_raw_generate( + elif koboldai_vars.model in model_functions: + model_functions[koboldai_vars.model]( prompt_tokens=prompt_tokens, - max_length=max_length, - batch_count=batch_count + max_new=max_new, + batch_count=batch_count, + gen_settings=gen_settings ) return GenerationResult( out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True @@ -5031,10 +5057,11 @@ def raw_generate( # Torch HF batch_encoded = torch_raw_generate( prompt_tokens=prompt_tokens, - max_new=max_length if not bypass_hf_maxlength else int(2e9), + max_new=max_new if not bypass_hf_maxlength else int(2e9), do_streaming=do_streaming, do_dynamic_wi=do_dynamic_wi, batch_count=batch_count, + gen_settings=gen_settings ) return GenerationResult( out_batches=batch_encoded, @@ -5045,8 +5072,9 @@ def raw_generate( def tpu_raw_generate( prompt_tokens: List[int], - max_length: int, + max_new: int, batch_count: int, + gen_settings: GenerationSettings ): # Mostly lifted from apiactionsubmit_tpumtjgenerate @@ -5055,20 +5083,20 @@ def tpu_raw_generate( genout = tpool.execute( tpu_mtj_backend.infer_static, np.uint32(prompt_tokens), - gen_len = max_length, - temp=koboldai_vars.temp, - top_p=koboldai_vars.top_p, - top_k=koboldai_vars.top_k, - tfs=koboldai_vars.tfs, - typical=koboldai_vars.typical, - top_a=koboldai_vars.top_a, + gen_len = max_new, + temp=gen_settings.temp, + top_p=gen_settings.top_p, + top_k=gen_settings.top_k, + tfs=gen_settings.tfs, + typical=gen_settings.typical, + top_a=gen_settings.top_a, numseqs=batch_count, - repetition_penalty=koboldai_vars.rep_pen, - rpslope=koboldai_vars.rep_pen_slope, - rprange=koboldai_vars.rep_pen_range, + repetition_penalty=gen_settings.rep_pen, + rpslope=gen_settings.rep_pen_slope, + rprange=gen_settings.rep_pen_range, soft_embeddings=koboldai_vars.sp, soft_tokens=soft_tokens, - sampler_order=koboldai_vars.sampler_order, + sampler_order=gen_settings.sampler_order, ) genout = np.array(genout) @@ -5078,10 +5106,11 @@ def tpu_raw_generate( def torch_raw_generate( prompt_tokens: Union[List[int], torch.Tensor], max_new: int, + gen_settings: GenerationSettings, do_streaming: bool = False, do_dynamic_wi: bool = False, - batch_count: int = 1 + batch_count: int = 1, ): koboldai_vars.inference_config.do_streaming = do_streaming @@ -5120,8 +5149,9 @@ def torch_raw_generate( def oai_raw_generate( prompt_tokens: List[int], - max_length: int, + max_new: int, batch_count: int, + gen_settings: GenerationSettings, ): # Taken mainly from oairequest() @@ -5140,27 +5170,27 @@ def oai_raw_generate( if 'GooseAI' in koboldai_vars.configname: reqdata = { 'prompt': decoded_prompt, - 'max_tokens': koboldai_vars.genamt, - 'temperature': koboldai_vars.temp, - 'top_a': koboldai_vars.top_a, - 'top_p': koboldai_vars.top_p, - 'top_k': koboldai_vars.top_k, - 'tfs': koboldai_vars.tfs, - 'typical_p': koboldai_vars.typical, - 'repetition_penalty': koboldai_vars.rep_pen, - 'repetition_penalty_slope': koboldai_vars.rep_pen_slope, - 
'repetition_penalty_range': koboldai_vars.rep_pen_range, - 'n': koboldai_vars.numseqs, + 'max_tokens': max_new, + 'temperature': gen_settings.temp, + 'top_a': gen_settings.top_a, + 'top_p': gen_settings.top_p, + 'top_k': gen_settings.top_k, + 'tfs': gen_settings.tfs, + 'typical_p': gen_settings.typical, + 'repetition_penalty': gen_settings.rep_pen, + 'repetition_penalty_slope': gen_settings.rep_pen_slope, + 'repetition_penalty_range': gen_settings.rep_pen_range, + 'n': batch_count, # TODO: Implement streaming 'stream': False } else: reqdata = { 'prompt': decoded_prompt, - 'max_tokens': koboldai_vars.genamt, - 'temperature': koboldai_vars.temp, - 'top_p': koboldai_vars.top_p, - 'n': koboldai_vars.numseqs, + 'max_tokens': max_new, + 'temperature': gen_settings.temp, + 'top_p': gen_settings.top_p, + 'n': batch_count, 'stream': False } @@ -5198,6 +5228,173 @@ def oai_raw_generate( set_aibusy(0) return [] +class HordeException(Exception): + pass + +def cluster_raw_generate( + prompt_tokens: List[int], + max_new: int, + batch_count: int, + gen_settings: GenerationSettings, +): + decoded_prompt = utils.decodenewlines(tokenizer.decode(prompt_tokens)) + + # Store context in memory to use it for comparison with generated content + koboldai_vars.lastctx = decoded_prompt + + # Build request JSON data + reqdata = { + 'max_length': max_new, + 'max_context_length': koboldai_vars.max_length, + 'rep_pen': gen_settings.rep_pen, + 'rep_pen_slope': gen_settings.rep_pen_slope, + 'rep_pen_range': gen_settings.rep_pen_range, + 'temperature': gen_settings.temp, + 'top_p': gen_settings.top_p, + 'top_k': gen_settings.top_k, + 'top_a': gen_settings.top_a, + 'tfs': gen_settings.tfs, + 'typical': gen_settings.typical, + 'n': batch_count, + } + + cluster_metadata = { + 'prompt': decoded_prompt, + 'params': reqdata, + 'api_key': koboldai_vars.apikey, + 'models': koboldai_vars.cluster_requested_models, + } + + try: + # Create request + req = requests.post( + koboldai_vars.colaburl[:-8] + "/api/v1/generate/sync", + json=cluster_metadata, + ) + js = req.json() + except requests.exceptions.ConnectionError: + errmsg = f"Horde unavailable. Please try again later" + print("{0}{1}{2}".format(colors.RED, errmsg, colors.END)) + raise HordeException(errmsg) + except requests.exceptions.JSONDecodeError: + errmsg = f"Unexpected message received from the Horde: '{req.text}'" + print("{0}{1}{2}".format(colors.RED, errmsg, colors.END)) + raise HordeException(errmsg) + if(req.status_code == 503): + errmsg = f"KoboldAI API Error: No available KoboldAI servers found in Horde to fulfil this request using the selected models or other properties." + print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END)) + raise HordeException(errmsg) + if(req.status_code != 200): + errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console." + print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END)) + raise HordeException(errmsg) + gen_servers = [(cgen['server_name'],cgen['server_id']) for cgen in js] + print(f"{colors.GREEN}Generations by: {gen_servers}{colors.END}") + + # TODO: Fix this. Request context issues!! 
+ # Just in case we want to announce it to the user + # if len(js) == 1: + # warnmsg = f"Text generated by {js[0]['server_name']}" + # emit('from_server', {'cmd': 'warnmsg', 'data': warnmsg}, broadcast=True) + return np.array([tokenizer.encode(cgen["text"]) for cgen in js]) + +def colab_raw_generate( + prompt_tokens: List[int], + max_new: int, + batch_count: int, + gen_settings: GenerationSettings, +): + decoded_prompt = utils.decodenewlines(tokenizer.decode(prompt_tokens)) + + # Store context in memory to use it for comparison with generated content + koboldai_vars.lastctx = decoded_prompt + + # Build request JSON data + reqdata = { + 'text': decoded_prompt, + 'min': 0, + 'max': max_new, + 'rep_pen': gen_settings.rep_pen, + 'rep_pen_slope': gen_settings.rep_pen_slope, + 'rep_pen_range': gen_settings.rep_pen_range, + 'temperature': gen_settings.temp, + 'top_p': gen_settings.top_p, + 'top_k': gen_settings.top_k, + 'tfs': gen_settings.tfs, + 'typical': gen_settings.typical, + 'topa': gen_settings.top_a, + 'numseqs': batch_count, + 'retfultxt': False + } + + # Create request + req = requests.post( + koboldai_vars.colaburl, + json = reqdata + ) + + # Deal with the response + if(req.status_code == 200): + js = req.json()["data"] + + # Try to be backwards compatible with outdated colab + if("text" in js): + genout = [getnewcontent(js["text"])] + else: + genout = js["seqs"] + + return np.array([tokenizer.encode(x) for x in genout]) + +def api_raw_generate( + prompt_tokens: List[int], + max_new: int, + batch_count: int, + gen_settings: GenerationSettings, +): + decoded_prompt = utils.decodenewlines(tokenizer.decode(prompt_tokens)) + + # Store context in memory to use it for comparison with generated content + koboldai_vars.lastctx = decoded_prompt + + # Build request JSON data + reqdata = { + 'prompt': decoded_prompt, + 'max_length': max_new, + 'max_context_length': gen_settings.max_length, + 'rep_pen': gen_settings.rep_pen, + 'rep_pen_slope': gen_settings.rep_pen_slope, + 'rep_pen_range': gen_settings.rep_pen_range, + 'temperature': gen_settings.temp, + 'top_p': gen_settings.top_p, + 'top_k': gen_settings.top_k, + 'top_a': gen_settings.top_a, + 'tfs': gen_settings.tfs, + 'typical': gen_settings.typical, + 'n': batch_count, + } + + # Create request + while True: + req = requests.post( + koboldai_vars.colaburl[:-8] + "/api/v1/generate", + json=reqdata, + ) + if(req.status_code == 503): # Server is currently generating something else so poll until it's our turn + time.sleep(1) + continue + js = req.json() + if(req.status_code != 200): + errmsg = "KoboldAI API Error: Failed to get a reply from the server. Please check the console." 
+ print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END)) + emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True) + emit("error", errmsg, broadcast=True, room="UI_2") + set_aibusy(0) + return + + genout = [obj["text"] for obj in js["results"]] + return np.array([tokenizer.encode(x) for x in genout]) + + #==================================================================# # Send text to generator and deal with output #==================================================================# @@ -5351,271 +5548,6 @@ def pinsequence(n): send_debug() -#==================================================================# -# Send transformers-style request to ngrok/colab host -#==================================================================# -def sendtocolab(txt, min, max): - # Log request to console - if not koboldai_vars.quiet: - print("{0}Tokens:{1}, Txt:{2}{3}".format(colors.YELLOW, min-1, txt, colors.END)) - - # Store context in memory to use it for comparison with generated content - koboldai_vars.lastctx = txt - - # Build request JSON data - reqdata = { - 'text': txt, - 'min': min, - 'max': max, - 'rep_pen': koboldai_vars.rep_pen, - 'rep_pen_slope': koboldai_vars.rep_pen_slope, - 'rep_pen_range': koboldai_vars.rep_pen_range, - 'temperature': koboldai_vars.temp, - 'top_p': koboldai_vars.top_p, - 'top_k': koboldai_vars.top_k, - 'tfs': koboldai_vars.tfs, - 'typical': koboldai_vars.typical, - 'topa': koboldai_vars.top_a, - 'numseqs': koboldai_vars.numseqs, - 'retfultxt': False - } - - # Create request - req = requests.post( - koboldai_vars.colaburl, - json = reqdata - ) - - # Deal with the response - if(req.status_code == 200): - js = req.json()["data"] - - # Try to be backwards compatible with outdated colab - if("text" in js): - genout = [getnewcontent(js["text"])] - else: - genout = js["seqs"] - - for i in range(koboldai_vars.numseqs): - koboldai_vars.lua_koboldbridge.outputs[i+1] = genout[i] - - execute_outmod() - if(koboldai_vars.lua_koboldbridge.regeneration_required): - koboldai_vars.lua_koboldbridge.regeneration_required = False - genout = [] - for i in range(koboldai_vars.numseqs): - genout.append(koboldai_vars.lua_koboldbridge.outputs[i+1]) - assert type(genout[-1]) is str - - koboldai_vars.actions.clear_unused_options() - koboldai_vars.actions.append_options([applyoutputformatting(x["generated_text"]) for x in genout]) - genout = [{"generated_text": x['text']} for x in koboldai_vars.actions.get_current_options()] - if(len(genout) == 1): - - genresult(genout[0]) - else: - # Convert torch output format to transformers - seqs = [] - for seq in genout: - seqs.append({"generated_text": seq}) - if(koboldai_vars.lua_koboldbridge.restart_sequence is not None and koboldai_vars.lua_koboldbridge.restart_sequence > 0): - genresult(genout[koboldai_vars.lua_koboldbridge.restart_sequence-1]["generated_text"]) - else: - genselect(genout) - - # Format output before continuing - #genout = applyoutputformatting(getnewcontent(genout)) - - # Add formatted text to Actions array and refresh the game screen - #koboldai_vars.actions.append(genout) - #refresh_story() - #emit('from_server', {'cmd': 'texteffect', 'data': koboldai_vars.actions.get_last_key() + 1 if len(koboldai_vars.actions) else 0}) - - set_aibusy(0) - else: - errmsg = "Colab API Error: Failed to get a reply from the server. Please check the colab console." 
-    print("{0}{1}{2}".format(colors.RED, errmsg, colors.END))
-    emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True, room="UI_1")
-    set_aibusy(0)
-
-
-#==================================================================#
-#  Send transformers-style request to KoboldAI API
-#==================================================================#
-def sendtoapi(txt, min, max):
-    # Log request to console
-    if not koboldai_vars.quiet:
-        print("{0}Tokens:{1}, Txt:{2}{3}".format(colors.YELLOW, min-1, txt, colors.END))
-
-    # Store context in memory to use it for comparison with generated content
-    koboldai_vars.lastctx = txt
-
-    # Build request JSON data
-    reqdata = {
-        'prompt': txt,
-        'max_length': max - min + 1,
-        'max_context_length': koboldai_vars.max_length,
-        'rep_pen': koboldai_vars.rep_pen,
-        'rep_pen_slope': koboldai_vars.rep_pen_slope,
-        'rep_pen_range': koboldai_vars.rep_pen_range,
-        'temperature': koboldai_vars.temp,
-        'top_p': koboldai_vars.top_p,
-        'top_k': koboldai_vars.top_k,
-        'top_a': koboldai_vars.top_a,
-        'tfs': koboldai_vars.tfs,
-        'typical': koboldai_vars.typical,
-        'n': koboldai_vars.numseqs,
-    }
-
-    # Create request
-    while True:
-        req = requests.post(
-            koboldai_vars.colaburl[:-8] + "/api/v1/generate",
-            json=reqdata,
-        )
-        if(req.status_code == 503):  # Server is currently generating something else so poll until it's our turn
-            time.sleep(1)
-            continue
-        js = req.json()
-        if(req.status_code != 200):
-            errmsg = "KoboldAI API Error: Failed to get a reply from the server. Please check the console."
-            print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END))
-            emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-            emit("error", errmsg, broadcast=True, room="UI_2")
-            set_aibusy(0)
-            return
-
-        genout = [obj["text"] for obj in js["results"]]
-
-        for i in range(koboldai_vars.numseqs):
-            koboldai_vars.lua_koboldbridge.outputs[i+1] = genout[i]
-
-        execute_outmod()
-        if(koboldai_vars.lua_koboldbridge.regeneration_required):
-            koboldai_vars.lua_koboldbridge.regeneration_required = False
-            genout = []
-            for i in range(koboldai_vars.numseqs):
-                genout.append(koboldai_vars.lua_koboldbridge.outputs[i+1])
-                assert type(genout[-1]) is str
-
-        if(len(genout) == 1):
-            genresult(genout[0])
-        else:
-            adjusted_genout = []
-            for item in genout:
-                adjusted_genout.append({"generated_text": item})
-            # Convert torch output format to transformers
-            seqs = []
-            for seq in adjusted_genout:
-                seqs.append({"generated_text": seq})
-            if(koboldai_vars.lua_koboldbridge.restart_sequence is not None and koboldai_vars.lua_koboldbridge.restart_sequence > 0):
-                genresult(adjusted_genout[koboldai_vars.lua_koboldbridge.restart_sequence-1]["generated_text"])
-            else:
-                genselect(adjusted_genout)
-
-        set_aibusy(0)
-        return
-
-#==================================================================#
-#  Send transformers-style request to KoboldAI Cluster
-#==================================================================#
-def sendtocluster(txt, min, max):
-    # Log request to console
-    if not koboldai_vars.quiet:
-        print("{0}Tokens:{1}, Txt:{2}{3}".format(colors.YELLOW, min-1, txt, colors.END))
-
-    # Store context in memory to use it for comparison with generated content
-    koboldai_vars.lastctx = txt
-
-    # Build request JSON data
-    reqdata = {
-        'max_length': max - min + 1,
-        'max_context_length': koboldai_vars.max_length,
-        'rep_pen': koboldai_vars.rep_pen,
-        'rep_pen_slope': koboldai_vars.rep_pen_slope,
-        'rep_pen_range': koboldai_vars.rep_pen_range,
-        'temperature': koboldai_vars.temp,
-        'top_p': koboldai_vars.top_p,
-        'top_k': koboldai_vars.top_k,
-        'top_a': koboldai_vars.top_a,
-        'tfs': koboldai_vars.tfs,
-        'typical': koboldai_vars.typical,
-        'n': koboldai_vars.numseqs,
-    }
-    cluster_metadata = {
-        'prompt': txt,
-        'params': reqdata,
-        'api_key': koboldai_vars.apikey,
-        'models': koboldai_vars.cluster_requested_models,
-    }
-    try:
-        # Create request
-        req = requests.post(
-            koboldai_vars.colaburl[:-8] + "/api/v1/generate/sync",
-            json=cluster_metadata,
-        )
-        js = req.json()
-    except requests.exceptions.ConnectionError:
-        errmsg = f"Horde unavailable. Please try again later"
-        print("{0}{1}{2}".format(colors.RED, errmsg, colors.END))
-        emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-        set_aibusy(0)
-        return
-    except requests.exceptions.JSONDecodeError:
-        errmsg = f"Unexpected message received from the Horde: '{req.text}'"
-        print("{0}{1}{2}".format(colors.RED, errmsg, colors.END))
-        emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-        set_aibusy(0)
-        return
-    if(req.status_code == 503):
-        errmsg = f"KoboldAI API Error: No available KoboldAI servers found in Horde to fulfil this request using the selected models or other properties."
-        print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END))
-        emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-        set_aibusy(0)
-        return
-    if(req.status_code != 200):
-        errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console."
-        print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END))
-        emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-        set_aibusy(0)
-        return
-    gen_servers = [(cgen['server_name'],cgen['server_id']) for cgen in js]
-    print(f"{colors.GREEN}Generations by: {gen_servers}{colors.END}")
-    # Just in case we want to announce it to the user
-    if len(js) == 1:
-        warnmsg = f"Text generated by {js[0]['server_name']}"
-        emit('from_server', {'cmd': 'warnmsg', 'data': warnmsg}, broadcast=True)
-    genout = [cgen['text'] for cgen in js]
-
-    for i in range(koboldai_vars.numseqs):
-        koboldai_vars.lua_koboldbridge.outputs[i+1] = genout[i]
-
-    execute_outmod()
-    if(koboldai_vars.lua_koboldbridge.regeneration_required):
-        koboldai_vars.lua_koboldbridge.regeneration_required = False
-        genout = []
-        for i in range(koboldai_vars.numseqs):
-            genout.append(koboldai_vars.lua_koboldbridge.outputs[i+1])
-            assert type(genout[-1]) is str
-
-    if(len(genout) == 1):
-        genresult(genout[0])
-    else:
-        adjusted_genout = []
-        for item in genout:
-            adjusted_genout.append({"generated_text": item})
-        # Convert torch output format to transformers
-        seqs = []
-        for seq in adjusted_genout:
-            seqs.append({"generated_text": seq})
-        if(koboldai_vars.lua_koboldbridge.restart_sequence is not None and koboldai_vars.lua_koboldbridge.restart_sequence > 0):
-            genresult(adjusted_genout[koboldai_vars.lua_koboldbridge.restart_sequence-1]["generated_text"])
-        else:
-            genselect(adjusted_genout)
-
-    set_aibusy(0)
-    return
-
 #==================================================================#
 #  Send text to TPU mesh transformer backend
 #==================================================================#
@@ -8242,7 +8174,7 @@ def UI_2_generate_raw():
         return Response(json.dumps({"error": "No model"}), status=500)
 
     try:
-        out = raw_generate(prompt, max_length=80)
+        out = raw_generate(prompt, max_new=80)
     except NotImplementedError as e:
         return Response(json.dumps({"error": str(e)}), status=500)

From 8f795b427002ceb48334be936218649c2a8bc2de Mon Sep 17 00:00:00 2001
From: somebody
Date: Sat, 24 Sep 2022 16:24:31 -0500
Subject: [PATCH 31/31] Fix conflict

...and bugs
---
 aiserver.py | 401 +++++++++-------------------------------------
 1 file changed, 66 insertions(+), 335 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index 0c6d7114..e2dd6905 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -5156,7 +5156,7 @@ def raw_generate(
             out_batches=batch_encoded, prompt=prompt_tokens, is_whole_generation=True
         )
     elif koboldai_vars.model in model_functions:
-        model_functions[koboldai_vars.model](
+        batch_encoded = model_functions[koboldai_vars.model](
             prompt_tokens=prompt_tokens,
             max_new=max_new,
             batch_count=batch_count,
@@ -5374,40 +5374,89 @@ def cluster_raw_generate(
         'prompt': decoded_prompt,
         'params': reqdata,
         'api_key': koboldai_vars.apikey,
-        'models': koboldai_vars.cluster_requested_models,
+        'models': [x for x in koboldai_vars.cluster_requested_models if x],
     }
     try:
         # Create request
         req = requests.post(
-            koboldai_vars.colaburl[:-8] + "/api/v1/generate/sync",
+            koboldai_vars.colaburl[:-8] + "/api/v1/generate/async",
            json=cluster_metadata,
         )
-        js = req.json()
     except requests.exceptions.ConnectionError:
         errmsg = f"Horde unavailable. Please try again later"
-        print("{0}{1}{2}".format(colors.RED, errmsg, colors.END))
+        logger.error(errmsg)
         raise HordeException(errmsg)
+
+    if req.status_code == 503:
+        errmsg = f"KoboldAI API Error: No available KoboldAI servers found in Horde to fulfil this request using the selected models or other properties."
+        logger.error(errmsg)
+        raise HordeException(errmsg)
+    elif not req.ok:
+        errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console."
+        logger.error(errmsg)
+        raise HordeException(errmsg)
+
+    try:
+        js = req.json()
     except requests.exceptions.JSONDecodeError:
         errmsg = f"Unexpected message received from the Horde: '{req.text}'"
-        print("{0}{1}{2}".format(colors.RED, errmsg, colors.END))
+        logger.error(errmsg)
         raise HordeException(errmsg)
-    if(req.status_code == 503):
-        errmsg = f"KoboldAI API Error: No available KoboldAI servers found in Horde to fulfil this request using the selected models or other properties."
-        print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END))
-        raise HordeException(errmsg)
-    if(req.status_code != 200):
-        errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console."
-        print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END))
-        raise HordeException(errmsg)
-    gen_servers = [(cgen['server_name'],cgen['server_id']) for cgen in js]
-    print(f"{colors.GREEN}Generations by: {gen_servers}{colors.END}")
+
+    request_id = js["id"]
+    logger.debug("Horde Request ID: {}".format(request_id))
 
-    # TODO: Fix this. Request context issues!!
+    # We've sent the request and got the ID back, now we need to watch it to see when it finishes
+    finished = False
+
+    while not finished:
+        try:
+            req = requests.get(koboldai_vars.colaburl[:-8] + "/api/v1/generate/check/" + request_id)
+        except requests.exceptions.ConnectionError:
+            errmsg = f"Horde unavailable. Please try again later"
+            logger.error(errmsg)
+            raise HordeException(errmsg)
+
+        if not req.ok:
+            errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console."
+            logger.error(req.text)
+            raise HordeException(errmsg)
+
+        try:
+            js = req.json()
+        except requests.exceptions.JSONDecodeError:
+            errmsg = f"Unexpected message received from the KoboldAI Horde: '{req.text}'"
+            logger.error(errmsg)
+            raise HordeException(errmsg)
+
+        if "done" not in js:
+            errmsg = f"Unexpected response received from the KoboldAI Horde: '{js}'"
+            logger.error(errmsg )
+            raise HordeException(errmsg)
+
+        finished = js["done"]
+        koboldai_vars.horde_wait_time = js["wait_time"]
+        koboldai_vars.horde_queue_position = js["queue_position"]
+        koboldai_vars.horde_queue_size = js["waiting"]
+
+        if not finished:
+            logger.debug(js)
+            time.sleep(1)
+
+    logger.debug("Last Horde Status Message: {}".format(js))
+    js = requests.get(koboldai_vars.colaburl[:-8] + "/api/v1/generate/prompt/" + request_id).json()['generations']
+    logger.debug("Horde Result: {}".format(js))
+
+    gen_servers = [(cgen['server_name'],cgen['server_id']) for cgen in js]
+    logger.info(f"Generations by: {gen_servers}")
+
+    # TODO: Fix this, using tpool so it's a context error
     # Just in case we want to announce it to the user
     # if len(js) == 1:
     #     warnmsg = f"Text generated by {js[0]['server_name']}"
     #     emit('from_server', {'cmd': 'warnmsg', 'data': warnmsg}, broadcast=True)
+    return np.array([tokenizer.encode(cgen["text"]) for cgen in js])
 
 def colab_raw_generate(
@@ -5512,7 +5561,6 @@ def api_raw_generate(
 #==================================================================#
 
 def generate(txt, minimum, maximum, found_entries=None):
-    print("ring ring", txt, minimum, maximum, found_entries)
     koboldai_vars.generated_tkns = 0
 
     if(found_entries is None):
@@ -5662,323 +5710,6 @@ def pinsequence(n):
 #==================================================================#
-# BEGIN CONFLICT
-# Send transformers-style request to ngrok/colab host
-#==================================================================#
-def sendtocolab(txt, min, max):
-    # Log request to console
-    if not koboldai_vars.quiet:
-        print("{0}Tokens:{1}, Txt:{2}{3}".format(colors.YELLOW, min-1, txt, colors.END))
-
-    # Store context in memory to use it for comparison with generated content
-    koboldai_vars.lastctx = txt
-
-    # Build request JSON data
-    reqdata = {
-        'text': txt,
-        'min': min,
-        'max': max,
-        'rep_pen': koboldai_vars.rep_pen,
-        'rep_pen_slope': koboldai_vars.rep_pen_slope,
-        'rep_pen_range': koboldai_vars.rep_pen_range,
-        'temperature': koboldai_vars.temp,
-        'top_p': koboldai_vars.top_p,
-        'top_k': koboldai_vars.top_k,
-        'tfs': koboldai_vars.tfs,
-        'typical': koboldai_vars.typical,
-        'topa': koboldai_vars.top_a,
-        'numseqs': koboldai_vars.numseqs,
-        'retfultxt': False
-    }
-
-    # Create request
-    req = requests.post(
-        koboldai_vars.colaburl,
-        json = reqdata
-        )
-
-    # Deal with the response
-    if(req.status_code == 200):
-        js = req.json()["data"]
-
-        # Try to be backwards compatible with outdated colab
-        if("text" in js):
-            genout = [getnewcontent(js["text"])]
-        else:
-            genout = js["seqs"]
-
-        for i in range(koboldai_vars.numseqs):
-            koboldai_vars.lua_koboldbridge.outputs[i+1] = genout[i]
-
-        execute_outmod()
-        if(koboldai_vars.lua_koboldbridge.regeneration_required):
-            koboldai_vars.lua_koboldbridge.regeneration_required = False
-            genout = []
-            for i in range(koboldai_vars.numseqs):
-                genout.append(koboldai_vars.lua_koboldbridge.outputs[i+1])
-                assert type(genout[-1]) is str
-
-        koboldai_vars.actions.clear_unused_options()
-        koboldai_vars.actions.append_options([applyoutputformatting(x["generated_text"]) for x in genout])
-        genout = [{"generated_text": x['text']} for x in koboldai_vars.actions.get_current_options()]
-        if(len(genout) == 1):
-
-            genresult(genout[0])
-        else:
-            # Convert torch output format to transformers
-            seqs = []
-            for seq in genout:
-                seqs.append({"generated_text": seq})
-            if(koboldai_vars.lua_koboldbridge.restart_sequence is not None and koboldai_vars.lua_koboldbridge.restart_sequence > 0):
-                genresult(genout[koboldai_vars.lua_koboldbridge.restart_sequence-1]["generated_text"])
-            else:
-                genselect(genout)
-
-        # Format output before continuing
-        #genout = applyoutputformatting(getnewcontent(genout))
-
-        # Add formatted text to Actions array and refresh the game screen
-        #koboldai_vars.actions.append(genout)
-        #refresh_story()
-        #emit('from_server', {'cmd': 'texteffect', 'data': koboldai_vars.actions.get_last_key() + 1 if len(koboldai_vars.actions) else 0})
-
-        set_aibusy(0)
-    else:
-        errmsg = "Colab API Error: Failed to get a reply from the server. Please check the colab console."
-        print("{0}{1}{2}".format(colors.RED, errmsg, colors.END))
-        emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True, room="UI_1")
-        set_aibusy(0)
-
-
-#==================================================================#
-#  Send transformers-style request to KoboldAI API
-#==================================================================#
-def sendtoapi(txt, min, max):
-    # Log request to console
-    if not koboldai_vars.quiet:
-        print("{0}Tokens:{1}, Txt:{2}{3}".format(colors.YELLOW, min-1, txt, colors.END))
-
-    # Store context in memory to use it for comparison with generated content
-    koboldai_vars.lastctx = txt
-
-    # Build request JSON data
-    reqdata = {
-        'prompt': txt,
-        'max_length': max - min + 1,
-        'max_context_length': koboldai_vars.max_length,
-        'rep_pen': koboldai_vars.rep_pen,
-        'rep_pen_slope': koboldai_vars.rep_pen_slope,
-        'rep_pen_range': koboldai_vars.rep_pen_range,
-        'temperature': koboldai_vars.temp,
-        'top_p': koboldai_vars.top_p,
-        'top_k': koboldai_vars.top_k,
-        'top_a': koboldai_vars.top_a,
-        'tfs': koboldai_vars.tfs,
-        'typical': koboldai_vars.typical,
-        'n': koboldai_vars.numseqs,
-    }
-
-    # Create request
-    while True:
-        req = requests.post(
-            koboldai_vars.colaburl[:-8] + "/api/v1/generate",
-            json=reqdata,
-        )
-        if(req.status_code == 503):  # Server is currently generating something else so poll until it's our turn
-            time.sleep(1)
-            continue
-        js = req.json()
-        if(req.status_code != 200):
-            errmsg = "KoboldAI API Error: Failed to get a reply from the server. Please check the console."
-            print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END))
-            emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-            emit("error", errmsg, broadcast=True, room="UI_2")
-            set_aibusy(0)
-            return
-
-        genout = [obj["text"] for obj in js["results"]]
-
-        for i in range(koboldai_vars.numseqs):
-            koboldai_vars.lua_koboldbridge.outputs[i+1] = genout[i]
-
-        execute_outmod()
-        if(koboldai_vars.lua_koboldbridge.regeneration_required):
-            koboldai_vars.lua_koboldbridge.regeneration_required = False
-            genout = []
-            for i in range(koboldai_vars.numseqs):
-                genout.append(koboldai_vars.lua_koboldbridge.outputs[i+1])
-                assert type(genout[-1]) is str
-
-        if(len(genout) == 1):
-            genresult(genout[0])
-        else:
-            adjusted_genout = []
-            for item in genout:
-                adjusted_genout.append({"generated_text": item})
-            # Convert torch output format to transformers
-            seqs = []
-            for seq in adjusted_genout:
-                seqs.append({"generated_text": seq})
-            if(koboldai_vars.lua_koboldbridge.restart_sequence is not None and koboldai_vars.lua_koboldbridge.restart_sequence > 0):
-                genresult(adjusted_genout[koboldai_vars.lua_koboldbridge.restart_sequence-1]["generated_text"])
-            else:
-                genselect(adjusted_genout)
-
-        set_aibusy(0)
-        return
-
-#==================================================================#
-#  Send transformers-style request to KoboldAI Cluster
-#==================================================================#
-def sendtocluster(txt, min, max):
-    # Log request to console
-    if not koboldai_vars.quiet:
-        logger.debug(f"Tokens Min:{min-1}")
-        logger.prompt(txt.encode("unicode_escape").decode("utf-8"))
-
-    # Store context in memory to use it for comparison with generated content
-    koboldai_vars.lastctx = txt
-    # Build request JSON data
-    reqdata = {
-        'max_length': max - min + 1,
-        'max_context_length': koboldai_vars.max_length,
-        'rep_pen': koboldai_vars.rep_pen,
-        'rep_pen_slope': koboldai_vars.rep_pen_slope,
-        'rep_pen_range': koboldai_vars.rep_pen_range,
-        'temperature': koboldai_vars.temp,
-        'top_p': koboldai_vars.top_p,
-        'top_k': koboldai_vars.top_k,
-        'top_a': koboldai_vars.top_a,
-        'tfs': koboldai_vars.tfs,
-        'typical': koboldai_vars.typical,
-        'n': koboldai_vars.numseqs,
-    }
-    cluster_metadata = {
-        'prompt': txt,
-        'params': reqdata,
-        'api_key': koboldai_vars.apikey,
-        'models': koboldai_vars.cluster_requested_models,
-    }
-    if cluster_metadata["models"] == [""]:
-        cluster_metadata["models"] = []
-    logger.debug(f"Horde Payload: {cluster_metadata}")
-    try:
-        # Create request
-        req = requests.post(
-            koboldai_vars.colaburl[:-8] + "/api/v1/generate/async",
-            json=cluster_metadata,
-        )
-    except requests.exceptions.ConnectionError:
-        errmsg = f"Horde unavailable. Please try again later"
-        logger.error(errmsg)
-        emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-        set_aibusy(0)
-        return
-    if(req.status_code == 503):
-        errmsg = f"KoboldAI API Error: No available KoboldAI servers found in Horde to fulfil this request using the selected models or other properties."
-        logger.error(req.text)
-        emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-        set_aibusy(0)
-        return
-    if(not req.ok):
-        errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console."
-        logger.error(req.text)
-        emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-        set_aibusy(0)
-        return
-
-    try:
-        js = req.json()
-    except requests.exceptions.JSONDecodeError:
-        errmsg = f"Unexpected message received from the KoboldAI Horde: '{req.text}'"
-        logger.error(errmsg)
-        emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-        set_aibusy(0)
-        return
-    request_id = js['id']
-    logger.debug("Horde Request ID: {}".format(request_id))
-    #We've sent the request and got the ID back, now we need to watch it to see when it finishes
-    finished = False
-    while not finished:
-        try:
-            req = requests.get(koboldai_vars.colaburl[:-8] + "/api/v1/generate/check/" + request_id)
-        except requests.exceptions.ConnectionError:
-            errmsg = f"Horde unavailable. Please try again later"
-            logger.error(errmsg)
-            emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-            set_aibusy(0)
-            return
-        if(not req.ok):
-            errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console."
-            logger.error(req.text)
-            emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-            set_aibusy(0)
-            return
-        try:
-            js = req.json()
-        except requests.exceptions.JSONDecodeError:
-            errmsg = f"Unexpected message received from the KoboldAI Horde: '{req.text}'"
-            logger.error(errmsg)
-            emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-            set_aibusy(0)
-            return
-        if not "done" in js:
-            errmsg = f"Unexpected response received from the KoboldAI Horde: '{js}'"
-            logger.error(errmsg )
-            emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
-            set_aibusy(0)
-            return
-        finished = js["done"]
-        koboldai_vars.horde_wait_time = js["wait_time"]
-        koboldai_vars.horde_queue_position = js["queue_position"]
-        koboldai_vars.horde_queue_size = js["waiting"]
-        if not finished:
-            logger.debug(js)
-            time.sleep(1)
-
-    logger.debug("Last Horde Status Message: {}".format(js))
-    js = requests.get(koboldai_vars.colaburl[:-8] + "/api/v1/generate/prompt/" + request_id).json()['generations']
-    logger.debug("Horde Result: {}".format(js))
-
-    gen_servers = [(cgen['server_name'],cgen['server_id']) for cgen in js]
-    logger.info(f"Generations by: {gen_servers}")
-    # Just in case we want to announce it to the user
-    if len(js) == 1:
-        warnmsg = f"Text generated by {js[0]['server_name']}"
-        emit('from_server', {'cmd': 'warnmsg', 'data': warnmsg}, broadcast=True)
-    genout = [cgen['text'] for cgen in js]
-
-    for i in range(koboldai_vars.numseqs):
-        koboldai_vars.lua_koboldbridge.outputs[i+1] = genout[i]
-
-    execute_outmod()
-    if(koboldai_vars.lua_koboldbridge.regeneration_required):
-        koboldai_vars.lua_koboldbridge.regeneration_required = False
-        genout = []
-        for i in range(koboldai_vars.numseqs):
-            genout.append(koboldai_vars.lua_koboldbridge.outputs[i+1])
-            assert type(genout[-1]) is str
-
-    if(len(genout) == 1):
-        genresult(genout[0])
-    else:
-        adjusted_genout = []
-        for item in genout:
-            adjusted_genout.append({"generated_text": item})
-        # Convert torch output format to transformers
-        seqs = []
-        for seq in adjusted_genout:
-            seqs.append({"generated_text": seq})
-        if(koboldai_vars.lua_koboldbridge.restart_sequence is not None and koboldai_vars.lua_koboldbridge.restart_sequence > 0):
-            genresult(adjusted_genout[koboldai_vars.lua_koboldbridge.restart_sequence-1]["generated_text"])
-        else:
-            genselect(adjusted_genout)
-
-    set_aibusy(0)
-    return
-
-#==================================================================#
-# END CONFLICT
 #  Send text to TPU mesh transformer backend
 #==================================================================#
 def tpumtjgenerate(txt, minimum, maximum, found_entries=None):