Strip the eos token from exllama generations.

The end-of-sequence (</s>) token indicates the end of a generation.
When a token sequence containing </s> is decoded, an extra (wrong)
space is inserted at the beginning of the generation. To avoid this,
strip the eos token out of the result before returning it.
The eos token was already being stripped later on, so this doesn't change
the final output; it only avoids the spurious leading space.
Author: Llama
Date:   2023-08-19 17:40:23 -07:00
parent 973aea12ea
commit 070cfd339a
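
As a standalone illustration of the trimming idea in the diff below (hypothetical names throughout: the trim_eos helper, the token ids, and the eos id of 2 are all invented for the example, and this is not KoboldAI's actual API), removing a trailing eos from the token matrix before it is decoded means the tokenizer never sees </s> and so never gets the chance to insert the extra leading space:

import numpy as np

def trim_eos(sequence: np.ndarray, prompt_len: int, eos_token_id: int) -> np.ndarray:
    """Drop the prompt and, if present, a single trailing eos token."""
    generated = sequence[:, prompt_len:]                  # keep only the newly generated tokens
    if generated.shape[1] > 0 and generated[0, -1] == eos_token_id:   # batch of 1 assumed
        generated = generated[:, :-1]                     # strip </s> so decode() never sees it
    return generated

# Made-up ids: the prompt is the first three tokens and 2 stands in for </s>.
seq = np.array([[10, 11, 12, 345, 678, 2]])
print(trim_eos(seq, prompt_len=3, eos_token_id=2))        # -> [[345 678]]

The diff below does the same thing via a trim_count, so that the token count kept in koboldai_vars stays consistent with the trimmed sequence.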

@@ -282,6 +282,7 @@ class model_backend(InferenceModel):
         self.generator.gen_begin_reuse(gen_in)
+        trim_count = 0
         for i in range(max_new):
             logits = self.model.forward(self.generator.sequence[:, -1:], self.generator.cache)
             logits[:, :, self.tokenizer.bos_token_id] = -10000.0
@@ -300,15 +301,19 @@ class model_backend(InferenceModel):
             utils.koboldai_vars.generated_tkns += 1
-            if token.item() == self.tokenizer.eos_token_id: break
+            if token.item() == self.tokenizer.eos_token_id:
+                trim_count = 1
+                break
-        utils.koboldai_vars.generated_tkns = max_new
+        utils.koboldai_vars.generated_tkns = max_new - trim_count
+        if trim_count > 0:
+            seq = self.generator.sequence[:, gen_in.size(1):-trim_count]
+        else:
+            seq = self.generator.sequence[:, gen_in.size(1):]
         return GenerationResult(
             model=self,
-            out_batches=np.array(
-                self.generator.sequence[:, gen_in.size(1):],
-            ),
+            out_batches=np.array(seq,),
             prompt=prompt_tokens,
             is_whole_generation=True,
             single_line=single_line,