Mirror of https://github.com/KoboldAI/KoboldAI-Client.git

Final touches

aiserver.py (21 changed lines)
@@ -4826,9 +4826,6 @@ def calcsubmit(txt):
         # Send it!
         ikrequest(subtxt)
 
-def __debug(*args):
-    print("[DBG] ", *args)
-
 def core_generate(text: list, min: int, max: int, found_entries: set):
     # This generation function is tangled with koboldai_vars intentionally. It
     # is meant for the story and nothing else.
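The hunk above deletes the ad-hoc `__debug` print helper outright rather than keeping it behind a switch. If similar tracing is wanted later, the standard-library logging module gives the same one-liner call sites with a level that can be silenced; a minimal sketch, assuming a logger name of our own choosing (not something aiserver.py defines):

import logging

# Hypothetical logger name; aiserver.py does not define this.
logger = logging.getLogger("koboldai.generate")
logging.basicConfig(level=logging.DEBUG, format="[DBG] %(message)s")

# Rough equivalent of the removed __debug("generate core", text) call:
logger.debug("generate core %r", ["some", "tokenized", "text"])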
@@ -4866,13 +4863,11 @@ def core_generate(text: list, min: int, max: int, found_entries: set):
 
     koboldai_vars._prompt = koboldai_vars.prompt
 
-    __debug("generate core", text)
     with torch.no_grad():
         already_generated = 0
         numseqs = koboldai_vars.numseqs
 
         while True:
-            __debug("generate loop start", text)
             # The reason this is a loop is due to how Dynamic WI works. We
             # cannot simply add the WI to the context mid-generation, so we
             # stop early, and then insert WI, then continue generating. That
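The comment kept as context here describes a stop-and-resume pattern: generate until a dynamic World Info keyword appears, splice the triggered entries into the context, then keep generating. A rough sketch of that control flow, with entirely hypothetical helper names (`generate_chunk`, `scan_for_wi`, and `insert_wi` are not KoboldAI functions):

def generate_with_dynamic_wi(context_tokens, budget, generate_chunk, scan_for_wi, insert_wi):
    # Sketch only: the real loop lives in core_generate and talks to koboldai_vars.
    generated = 0
    while generated < budget:
        # The backend either finishes the whole request or stops early
        # because a World Info keyword showed up in the output.
        chunk, stopped_for_wi = generate_chunk(context_tokens, budget - generated)
        context_tokens = context_tokens + chunk
        generated += len(chunk)
        if not stopped_for_wi:
            break
        # Splice the triggered entries into the context, then resume.
        entries = scan_for_wi(context_tokens)
        context_tokens = insert_wi(context_tokens, entries)
    return context_tokens, generated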
@@ -4888,15 +4883,12 @@ def core_generate(text: list, min: int, max: int, found_entries: set):
                 bypass_hf_maxlength=True,
             )
 
-            __debug("generate result", result.__dict__)
-
             genout = result.encoded
 
-            already_generated += len(genout[0]) - 1 # - len(gen_in[0])
+            already_generated += len(genout[0]) - 1
             assert already_generated <= koboldai_vars.genamt
 
             if result.is_whole_generation:
-                __debug("Outa here")
                 break
 
             # Generation stopped; why?
@@ -4953,13 +4945,9 @@ def core_generate(text: list, min: int, max: int, found_entries: set):
                 )
                 genout = torch.cat((soft_tokens.tile(koboldai_vars.numseqs, 1), genout), dim=-1)
             assert genout.shape[-1] + koboldai_vars.genamt - already_generated <= koboldai_vars.max_length
-            # diff = genout.shape[-1] - gen_in.shape[-1]
-            # minimum += diff
-            # maximum += diff
             gen_in = genout
             numseqs = 1
 
-    __debug("final out", genout, "already_gen", already_generated)
     return genout, already_generated
 
 class GenerationResult:
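For reference, the `soft_tokens.tile(koboldai_vars.numseqs, 1)` / `torch.cat(..., dim=-1)` line in this hunk prepends one copy of the soft-prompt placeholder ids to every sequence in the batch. A toy shape check with made-up values:

import torch

numseqs = 3
soft_tokens = torch.tensor([50257, 50258, 50259])    # (sp_length,) -- made-up ids
genout = torch.randint(0, 50000, (numseqs, 10))      # (numseqs, seq_length)

# tile() repeats the soft tokens once per sequence; cat() prepends them.
combined = torch.cat((soft_tokens.tile(numseqs, 1), genout), dim=-1)
assert combined.shape == (numseqs, soft_tokens.shape[0] + genout.shape[-1])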
@@ -4975,10 +4963,8 @@ class GenerationResult:
         # Controls if we should trim output by prompt length
         output_includes_prompt: bool = False,
     ):
-        # Shave prompt off of encoded response. Decoded does not return prompt.
-        # TODO: Does MTJ generation shave this off automatically? Test it!
-        print("shape", out_batches.shape)
-
+        # Shave prompt off of encoded response when needed (HF). Decoded does
+        # not return prompt.
         if output_includes_prompt:
             self.encoded = out_batches[:, len(prompt) - 1:]
         else:
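The new comment in this hunk says the prompt is shaved off the encoded output only when the backend includes it (the HF path, per the `output_includes_prompt` flag). The slice itself keeps everything from the last prompt token onward; a toy example with invented values:

import torch

prompt = [11, 22, 33, 44]                       # 4 prompt token ids (made up)
out_batches = torch.tensor([[11, 22, 33, 44, 55, 66],
                            [11, 22, 33, 44, 77, 88]])

encoded = out_batches[:, len(prompt) - 1:]
# tensor([[44, 55, 66],
#         [44, 77, 88]]) -- the last prompt token plus the generated tokens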
@@ -5065,7 +5051,6 @@ def tpu_raw_generate(
 
     # Mostly lifted from apiactionsubmit_tpumtjgenerate
     soft_tokens = tpumtjgetsofttokens()
-    __debug("we are generating with", prompt_tokens, "batch", batch_count, "soft tokens", soft_tokens)
 
     genout = tpool.execute(
         tpu_mtj_backend.infer_static,
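`tpool.execute` in this hunk is eventlet's thread-pool helper: it runs a blocking callable in a native thread so the green-threaded server is not stalled while the TPU backend works. A standalone sketch of that pattern, with a made-up `slow_infer` standing in for `tpu_mtj_backend.infer_static`:

import time
from eventlet import tpool

def slow_infer(tokens):
    # Stand-in for a long, blocking backend call.
    time.sleep(1)
    return [t + 1 for t in tokens]

# Only the calling green thread waits; other greenlets keep running.
result = tpool.execute(slow_infer, [1, 2, 3])
print(result)  # [2, 3, 4]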