This commit is contained in:
somebody
2023-04-27 20:12:29 -05:00
parent 4559112551
commit cd1eb97c2a

View File

@@ -9,6 +9,7 @@ class GenericTokenizer:
def __init__(self, tokenizer: Union[Tokenizer, PreTrainedTokenizer]) -> None:
    """Wrap *tokenizer* and cache the set of token ids it knows about.

    Args:
        tokenizer: The underlying tokenizer. It is stored as
            ``self.tokenizer`` and must expose its vocabulary either via
            a ``get_vocab()`` method or a ``vocab`` mapping attribute.

    Raises:
        AttributeError: If the tokenizer offers neither ``get_vocab()``
            nor ``vocab``.
    """
    self.tokenizer = tokenizer

    # ``get_vocab()`` is the accessor shared by both annotated tokenizer
    # types; a bare ``.vocab`` attribute is not available on every
    # implementation. Fall back to ``.vocab`` only for duck-typed objects
    # that lack the method, so existing callers keep working.
    get_vocab = getattr(tokenizer, "get_vocab", None)
    vocab = get_vocab() if callable(get_vocab) else tokenizer.vocab

    # Ids present in the vocabulary; other methods of this class test
    # membership against this set before decoding.
    self.valid_tokens = set(vocab.values())
def __getattr__(self, name: str) -> Any: def __getattr__(self, name: str) -> Any:
# Fall back to tokenizer for non-generic stuff # Fall back to tokenizer for non-generic stuff
@@ -33,5 +34,10 @@ class GenericTokenizer:
if isinstance(tokens, int): if isinstance(tokens, int):
tokens = [tokens] tokens = [tokens]
for t in tokens:
if t not in self.valid_tokens:
print(f"WHAT ON EARTH IS {t}")
tokens = [t for t in tokens if t in self.valid_tokens]
return self.tokenizer.decode(tokens, skip_special_tokens=True) return self.tokenizer.decode(tokens)