Merge remote-tracking branch 'origin/united' into model-structure-update

0cc4m committed 2023-05-04 07:31:13 +02:00
5 changed files with 110 additions and 20 deletions


@@ -20,8 +20,89 @@ class HFInferenceModel(InferenceModel):
    def _post_load(self) -> None:
        # These are model-specific tokenizer overrides if a model has bad defaults
        if utils.koboldai_vars.model_type == "llama":
            # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer
            # is the actual LlamaTokenizer.
            self.tokenizer.add_bos_token = False

            # HF transformers no longer supports decode_with_prefix_space (the old
            # `self.tokenizer.decode_with_prefix_space = True` override is removed here).
            # We work around this by wrapping decode, encode, and __call__ with
            # versions that undo the 'prefix space' misfeature of sentencepiece.
            vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size))
            has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")}
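            # For example (hypothetical ids): if vocab[6804] == '▁Bob' and
            # vocab[4670] == 'ob', then 6804 lands in has_prefix_space while
            # 4670 does not; '▁' (U+2581) is sentencepiece's space metasymbol.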
            # Wrap 'decode' with a method that always returns text starting with a space
            # when the head token starts with a space. This is what 'decode_with_prefix_space'
            # used to do, and we implement it using the same technique: building a cache of
            # tokens that should have a prefix space, then prepending a space if the first
            # token is in that set. We also work around a bizarre behavior in which decoding
            # the single token 13 behaves differently than decoding a sequence containing only [13].
            original_decode = type(self.tokenizer.tokenizer).decode

            def decode_wrapper(self, token_ids, *args, **kwargs):
                first = None
                # Note: the code below that wraps single-value token_ids in a list
                # works around this wonky behavior:
                #   >>> t.decode(13)
                #   '<0x0A>'
                #   >>> t.decode([13])
                #   '\n'
                # Not doing this causes token streaming to receive <0x0A> characters
                # instead of newlines.
                if isinstance(token_ids, int):
                    first = token_ids
                    token_ids = [first]
                elif hasattr(token_ids, 'dim'):  # Check for e.g. torch.Tensor
                    # Tensors don't support the Python idiom of 'empty is falsy',
                    # and the special case of dimension-0 tensors also needs to be
                    # handled separately.
                    if token_ids.dim() == 0:
                        first = int(token_ids.item())
                        token_ids = [first]
                    elif len(token_ids) > 0:
                        first = int(token_ids[0])
                elif token_ids:
                    first = token_ids[0]
                result = original_decode(self, token_ids, *args, **kwargs)
                if first is not None and first in has_prefix_space:
                    result = " " + result
                return result
            # GenericTokenizer overrides __setattr__, so we need object.__setattr__ to
            # bypass it; __get__ binds decode_wrapper as a method of this tokenizer instance.
            object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer))
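            # Effect (sketch, hypothetical id): if 6804 were '▁Bob', then
            #   >>> self.tokenizer.decode([6804])
            #   ' Bob'
            # where the unwrapped LlamaTokenizer.decode would return 'Bob'.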
            # Wrap encode and __call__ to work around the 'prefix space' misfeature as well.
            # The problem is that "Bob" at the start of text is encoded as if it were
            # " Bob". This means you can't split text, encode the pieces, concatenate the
            # tokens, decode them, and get the original text back. The workaround is to
            # prepend a known token that (1) starts with a space and (2) is not the prefix
            # of any other token. After searching through the vocab, " ," (space plus comma)
            # is the only token containing only printable ASCII characters that fits the
            # bill. By prepending ',' to the text, the original encode method always returns
            # [1919, ...], where the tail of the sequence is the actual encoded result we
            # want, without the prefix-space behavior.
            original_encode = type(self.tokenizer.tokenizer).encode

            def encode_wrapper(self, text, *args, **kwargs):
                if type(text) is str:
                    text = ',' + text
                    result = original_encode(self, text, *args, **kwargs)
                    result = result[1:]  # Drop the leading sentinel token (1919, ',')
                else:
                    result = original_encode(self, text, *args, **kwargs)
                return result

            object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer))
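            # Illustration (hypothetical ids): raw encode("Bob") acts like encode(" Bob")
            # and might return [6804] ('▁Bob'); the wrapper instead encodes ",Bob" to
            # [1919, 29933, 711] and strips the sentinel, yielding [29933, 711]
            # ('B', 'ob') with no phantom leading space.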
            # Since 'encode' is documented as deprecated, also override __call__.
            # This doesn't appear to currently be used by KoboldAI, but we wrap it
            # in case someone uses it in the future. (Note that __call__ normally
            # returns a BatchEncoding rather than a plain list, so the [1:] slice
            # below assumes list-like output.)
            original_call = type(self.tokenizer.tokenizer).__call__

            def call_wrapper(self, text, *args, **kwargs):
                if type(text) is str:
                    text = ',' + text
                    result = original_call(self, text, *args, **kwargs)
                    result = result[1:]
                else:
                    result = original_call(self, text, *args, **kwargs)
                return result

            object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer))
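            # Net effect (sketch): text now round-trips through split/encode/concat/decode:
            #   >>> ids = self.tokenizer.encode("Hello") + self.tokenizer.encode(" world")
            #   >>> self.tokenizer.decode(ids)
            #   'Hello world'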
        elif utils.koboldai_vars.model_type == "opt":
            self.tokenizer._koboldai_header = self.tokenizer.encode("")
            self.tokenizer.add_bos_token = False
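
For reference, the core trick used throughout this hunk (binding a wrapper function as a
method on one specific instance via __get__) can be sketched in isolation. This is a
minimal standalone illustration, not part of the commit; the Tok class is hypothetical:

    class Tok:
        def decode(self, ids):
            return "".join(chr(i) for i in ids)

    tok = Tok()
    original_decode = type(tok).decode

    def decode_wrapper(self, ids):
        # Prepend a marker, then defer to the unbound original method.
        return "*" + original_decode(self, ids)

    # __get__ binds the plain function to this one instance, so only this
    # tok's decode is wrapped; other Tok instances are unaffected.
    tok.decode = decode_wrapper.__get__(tok)
    assert tok.decode([72, 105]) == "*Hi"

(The commit uses object.__setattr__ instead of plain attribute assignment only because
GenericTokenizer overrides __setattr__.)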