Strip the eos token from exllama generations.

The end-of-sequence (</s>) token indicates the end of a generation.
When a token sequence containing </s> is decoded, an extra (wrong)
space is inserted at the beginning of the generation. To avoid this,
strip the eos token out of the result before returning it.
The eos token was already being stripped later on, so this doesn't change
the final output; it only avoids the spurious leading space.
Author: Llama
Date:   2023-08-19 17:40:23 -07:00
parent 973aea12ea
commit 070cfd339a
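
As a standalone illustration of the trimming idea in the diff below (hypothetical names throughout: the trim_eos helper, the token ids, and the eos id of 2 are all invented for the example, and this is not KoboldAI's actual API), removing a trailing eos from the token matrix before it is decoded means the tokenizer never sees </s> and so never gets the chance to insert the extra leading space:

import numpy as np

def trim_eos(sequence: np.ndarray, prompt_len: int, eos_token_id: int) -> np.ndarray:
    """Drop the prompt and, if present, a single trailing eos token."""
    generated = sequence[:, prompt_len:]                  # keep only the newly generated tokens
    if generated.shape[1] > 0 and generated[0, -1] == eos_token_id:   # batch of 1 assumed
        generated = generated[:, :-1]                     # strip </s> so decode() never sees it
    return generated

# Made-up ids: the prompt is the first three tokens and 2 stands in for </s>.
seq = np.array([[10, 11, 12, 345, 678, 2]])
print(trim_eos(seq, prompt_len=3, eos_token_id=2))        # -> [[345 678]]

The diff below does the same thing via a trim_count, so that the token count kept in koboldai_vars stays consistent with the trimmed sequence.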

@@ -282,6 +282,7 @@ class model_backend(InferenceModel):
         self.generator.gen_begin_reuse(gen_in)
+        trim_count = 0
         for i in range(max_new):
             logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache)
             logits[:, :, self.tokenizer.bos_token_id] = -10000.0
@@ -300,15 +301,19 @@ class model_backend(InferenceModel):
             utils.koboldai_vars.generated_tkns += 1
-            if token.item() == self.tokenizer.eos_token_id: break
+            if token.item() == self.tokenizer.eos_token_id:
+                trim_count = 1
+                break
-        utils.koboldai_vars.generated_tkns = max_new
+        utils.koboldai_vars.generated_tkns = max_new - trim_count
+        if trim_count > 0:
+            seq = self.generator.sequence[:, gen_in.size(1):-trim_count]
+        else:
+            seq = self.generator.sequence[:, gen_in.size(1):]
         return GenerationResult(
             model=self,
-            out_batches=np.array(
-                self.generator.sequence[:, gen_in.size(1):],
-            ),
+            out_batches=np.array(seq,),
             prompt=prompt_tokens,
             is_whole_generation=True,
             single_line=single_line,