Experiment

somebody
2023-04-27 20:28:04 -05:00
parent cd1eb97c2a
commit ffa7b22734

@@ -29,15 +29,16 @@ class GenericTokenizer:
         return ret.ids
 
     def decode(self, tokens: Union[int, List[int], torch.Tensor]) -> str:
+        return self.tokenizer.decode(tokens)
         if isinstance(tokens, torch.Tensor):
             tokens = tokens.cpu().tolist()
 
         if isinstance(tokens, int):
             tokens = [tokens]
 
-        for t in tokens:
-            if t not in self.valid_tokens:
-                print(f"WHAT ON EARTH IS {t}")
+        # Sometimes soft token placeholders aren't in the vocab, which causes
+        # errors on decode. Obviously we can't express these tokens as text so
+        # we can probably slice 'em out without too much issue
         tokens = [t for t in tokens if t in self.valid_tokens]
 
         return self.tokenizer.decode(tokens)
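
The removed loop was debug logging for out-of-vocab ids; the new comment documents the list-comprehension filter that replaces it, and the added early return short-circuits that filter entirely (presumably the "Experiment" of the commit title). Below is a minimal sketch of the filtering idea, assuming a Hugging Face tokenizer whose get_vocab() returns a token-to-id mapping; the wrapper class name and constructor are hypothetical, and only decode()'s body mirrors the diff.

from typing import List, Union

import torch
from transformers import AutoTokenizer


class FilteringTokenizer:
    """Hypothetical stand-in for GenericTokenizer's filtering decode."""

    def __init__(self, tokenizer) -> None:
        self.tokenizer = tokenizer
        # Every token id the wrapped tokenizer can express as text.
        # (Assumption: built from get_vocab(); the diff doesn't show this.)
        self.valid_tokens = set(tokenizer.get_vocab().values())

    def decode(self, tokens: Union[int, List[int], torch.Tensor]) -> str:
        # Normalize tensors and bare ints to a plain list of ids.
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().tolist()
        if isinstance(tokens, int):
            tokens = [tokens]
        # Drop ids the vocab can't express (e.g. soft-token placeholders)
        # so the underlying decode doesn't error on them.
        tokens = [t for t in tokens if t in self.valid_tokens]
        return self.tokenizer.decode(tokens)


tok = FilteringTokenizer(AutoTokenizer.from_pretrained("gpt2"))
# 15496 and 995 decode to "Hello world" under GPT-2; 99999999 is out of
# vocab and gets sliced out rather than erroring on decode.
print(tok.decode([15496, 995, 99999999]))

Storing valid_tokens as a set keeps the membership check O(1) per id, which matters when filtering long generations token by token.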