From 070cfd339a27ee1ef62fd74a495fb34b80a4920c Mon Sep 17 00:00:00 2001
From: Llama <34464159+pi6am@users.noreply.github.com>
Date: Sat, 19 Aug 2023 17:40:23 -0700
Subject: [PATCH] Strip the eos token from exllama generations.

The end-of-sequence (eos) token indicates the end of a generation.
When a token sequence containing an eos token is decoded, an extra
(wrong) space is inserted at the beginning of the generation. To avoid
this, strip the eos token out of the result before returning it. The
eos token was already getting stripped later, so this doesn't change
the output except to avoid the spurious leading space.
---
 modeling/inference_models/exllama/class.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index e3c7a874..508c6a79 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -282,6 +282,7 @@ class model_backend(InferenceModel):
 
         self.generator.gen_begin_reuse(gen_in)
 
+        trim_count = 0
         for i in range(max_new):
             logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache)
             logits[:, :, self.tokenizer.bos_token_id] = -10000.0
@@ -300,15 +301,19 @@ class model_backend(InferenceModel):
 
             utils.koboldai_vars.generated_tkns += 1
 
-            if token.item() == self.tokenizer.eos_token_id: break
+            if token.item() == self.tokenizer.eos_token_id:
+                trim_count = 1
+                break
 
-        utils.koboldai_vars.generated_tkns = max_new
+        utils.koboldai_vars.generated_tkns = max_new - trim_count
 
+        if trim_count > 0:
+            seq = self.generator.sequence[:, gen_in.size(1):-trim_count]
+        else:
+            seq = self.generator.sequence[:, gen_in.size(1):]
         return GenerationResult(
             model=self,
-            out_batches=np.array(
-                self.generator.sequence[:, gen_in.size(1):],
-            ),
+            out_batches=np.array(seq,),
             prompt=prompt_tokens,
             is_whole_generation=True,
             single_line=single_line,
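
Note (appended for review, not part of the patch): the sketch below is a minimal,
hypothetical restatement of the trimming this change performs. The helper name
trim_trailing_eos, the prompt_len parameter, and the example token ids are all
made up for illustration; the patch itself works on self.generator.sequence and
gen_in.size(1). The idea is the same: drop a trailing eos token from the
generated ids before they are decoded, so no spurious leading space is produced.

    # Hypothetical sketch of the trimming the patch performs.
    import numpy as np

    def trim_trailing_eos(sequence: np.ndarray, prompt_len: int, eos_token_id: int) -> np.ndarray:
        """Return the generated portion of `sequence` without a trailing eos token."""
        gen = sequence[:, prompt_len:]
        # At most one eos can appear, because generation breaks as soon as it is sampled.
        if gen.shape[1] > 0 and gen[0, -1] == eos_token_id:
            gen = gen[:, :-1]
        return gen

    # Usage: a prompt of length 3 followed by 4 generated tokens, the last being eos (id 2).
    seq = np.array([[10, 11, 12, 42, 43, 44, 2]])
    print(trim_trailing_eos(seq, prompt_len=3, eos_token_id=2))  # [[42 43 44]]

Because generation stops the moment eos is sampled, at most one trailing token
ever needs to be removed, which is why the patch only ever sets trim_count to 1.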