From 070cfd339a27ee1ef62fd74a495fb34b80a4920c Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Sat, 19 Aug 2023 17:40:23 -0700
Subject: [PATCH] Strip the eos token from exllama generations.

The end-of-sequence (eos) token indicates the end of a generation.
When a token sequence containing an eos token is decoded, an extra
(wrong) space is inserted at the beginning of the generation. To avoid
this, strip the eos token out of the result before returning it. The
eos token was already getting stripped later, so this doesn't change
the output except to avoid the spurious leading space.
---
 modeling/inference_models/exllama/class.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index e3c7a874..508c6a79 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -282,6 +282,7 @@ class model_backend(InferenceModel):
 
         self.generator.gen_begin_reuse(gen_in)
 
+        trim_count = 0
         for i in range(max_new):
             logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache)
             logits[:, :, self.tokenizer.bos_token_id] = -10000.0
@@ -300,15 +301,19 @@ class model_backend(InferenceModel):
 
             utils.koboldai_vars.generated_tkns += 1
 
-            if token.item() == self.tokenizer.eos_token_id: break
+            if token.item() == self.tokenizer.eos_token_id:
+                trim_count = 1
+                break
 
-        utils.koboldai_vars.generated_tkns = max_new
+        utils.koboldai_vars.generated_tkns = max_new - trim_count
 
+        if trim_count > 0:
+            seq = self.generator.sequence[:, gen_in.size(1):-trim_count]
+        else:
+            seq = self.generator.sequence[:, gen_in.size(1):]
         return GenerationResult(
             model=self,
-            out_batches=np.array(
-                self.generator.sequence[:, gen_in.size(1):],
-            ),
+            out_batches=np.array(seq,),
             prompt=prompt_tokens,
             is_whole_generation=True,
             single_line=single_line,
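
Note (appended for review, not part of the patch): the sketch below is a minimal,
hypothetical restatement of the trimming this change performs. The helper name
trim_trailing_eos, the prompt_len parameter, and the example token ids are all
made up for illustration; the patch itself works on self.generator.sequence and
gen_in.size(1). The idea is the same: drop a trailing eos token from the
generated ids before they are decoded, so no spurious leading space is produced.

    # Hypothetical sketch of the trimming the patch performs.
    import numpy as np

    def trim_trailing_eos(sequence: np.ndarray, prompt_len: int, eos_token_id: int) -> np.ndarray:
        """Return the generated portion of `sequence` without a trailing eos token."""
        gen = sequence[:, prompt_len:]
        # At most one eos can appear, because generation breaks as soon as it is sampled.
        if gen.shape[1] > 0 and gen[0, -1] == eos_token_id:
            gen = gen[:, :-1]
        return gen

    # Usage: a prompt of length 3 followed by 4 generated tokens, the last being eos (id 2).
    seq = np.array([[10, 11, 12, 42, 43, 44, 2]])
    print(trim_trailing_eos(seq, prompt_len=3, eos_token_id=2))  # [[42 43 44]]

Because generation stops the moment eos is sampled, at most one trailing token
ever needs to be removed, which is why the patch only ever sets trim_count to 1.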