From ffa7b227349cd6f512b3dcf9397f84e60d07b031 Mon Sep 17 00:00:00 2001
From: somebody
Date: Thu, 27 Apr 2023 20:28:04 -0500
Subject: [PATCH] Experiment

---
 modeling/tokenizer.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/modeling/tokenizer.py b/modeling/tokenizer.py
index 17e402fb..26b619f6 100644
--- a/modeling/tokenizer.py
+++ b/modeling/tokenizer.py
@@ -29,15 +29,16 @@ class GenericTokenizer:
         return ret.ids
 
     def decode(self, tokens: Union[int, List[int], torch.Tensor]) -> str:
+        return self.tokenizer.decode(tokens)
         if isinstance(tokens, torch.Tensor):
             tokens = tokens.cpu().tolist()
 
         if isinstance(tokens, int):
             tokens = [tokens]
 
-        for t in tokens:
-            if t not in self.valid_tokens:
-                print(f"WHAT ON EARTH IS {t}")
+        # Sometimes soft token placeholders aren't in the vocab, which causes
+        # errors on decode. Obviously we can't express these tokens as text so
+        # we can probably slice 'em out without too much issue
         tokens = [t for t in tokens if t in self.valid_tokens]
 
         return self.tokenizer.decode(tokens)
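
For context, below is a minimal, self-contained sketch of the behaviour the added comment describes: token IDs the vocab doesn't know about (such as soft-prompt placeholders) are sliced out before the wrapped tokenizer's decode is called, so decode doesn't error on tokens it can't express as text. ToyTokenizer and FilteringTokenizer are illustrative stand-ins for the patched GenericTokenizer and the tokenizer it wraps, and valid_tokens is assumed to be built from the wrapped vocab's IDs. Note that the sketch applies the filtering unconditionally, whereas the return added at the top of decode in the diff above exits before the filtering below it is reached.

from typing import List, Union

import torch


class ToyTokenizer:
    """Hypothetical stand-in for the wrapped tokenizer; only what decode() needs."""

    def __init__(self) -> None:
        self.vocab = {"hello": 0, "world": 1, "!": 2}
        self._inverse = {v: k for k, v in self.vocab.items()}

    def decode(self, ids: List[int]) -> str:
        # Raises KeyError on unknown IDs, much like decoding a soft-token
        # placeholder can blow up in a real tokenizer.
        return " ".join(self._inverse[i] for i in ids)


class FilteringTokenizer:
    """Sketch of the decode path the patch comment describes: drop IDs the
    vocab doesn't know about (e.g. soft-prompt placeholders) before decoding."""

    def __init__(self, tokenizer: ToyTokenizer) -> None:
        self.tokenizer = tokenizer
        # Assumption: valid_tokens is the set of IDs the wrapped vocab knows.
        self.valid_tokens = set(tokenizer.vocab.values())

    def decode(self, tokens: Union[int, List[int], torch.Tensor]) -> str:
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().tolist()

        if isinstance(tokens, int):
            tokens = [tokens]

        # Out-of-vocab IDs can't be rendered as text, so slice them out
        # rather than letting the underlying decode raise.
        tokens = [t for t in tokens if t in self.valid_tokens]

        return self.tokenizer.decode(tokens)


if __name__ == "__main__":
    tok = FilteringTokenizer(ToyTokenizer())
    # 50277 stands in for a soft-prompt placeholder ID outside the vocab.
    print(tok.decode(torch.tensor([0, 50277, 1, 2])))  # -> "hello world !"

Building valid_tokens once up front keeps the per-call cost of the filter to a set-membership check per token ID.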