mirror of
https://github.com/KoboldAI/KoboldAI-Client.git
synced 2025-06-05 21:59:24 +02:00
Fix tokenization and whitespace issues with llama-derived models
Work around the 'soft' prefix space behavior of sentencepiece. Override encode to restore the deleted HF support for decode_with_prefix_space. Override decode to skip the soft space and return true decoded tokens. Allow submitting chat messages with embedded newlines. Split sentences between punctuation and whitespace, rather than after whitespace. Also include trailing quotes and brackets after sentence stoppers. This avoids splitting ." and .) into two tokens, for instance. Insert whitespace at the beginning of the author's note, since sentences are split with leading whitespace. Remove spurious newlines at the end of chat responses.
This commit is contained in:
@@ -397,8 +397,8 @@ class koboldai_vars(object):
|
||||
|
||||
######################################### Setup Author's Note Data ########################################################
|
||||
authors_note_text = self.authornotetemplate.replace("<|>", self.authornote)
|
||||
if len(authors_note_text) > 0 and authors_note_text[-1] not in [" ", "\n"]:
|
||||
authors_note_text += " "
|
||||
if len(authors_note_text) > 0 and authors_note_text[0] not in [" ", "\n"]:
|
||||
authors_note_text = " " + authors_note_text
|
||||
authors_note_data = [[x, self.tokenizer.decode(x)] for x in self.tokenizer.encode(authors_note_text)]
|
||||
if used_tokens + len(authors_note_data) <= token_budget:
|
||||
used_tokens += len(authors_note_data)
|
||||
@@ -1384,7 +1384,11 @@ class KoboldStoryRegister(object):
|
||||
self.action_count = -1
|
||||
# The id of the last submission action, or 0 if the last append was not a submission
|
||||
self.submission_id = 0
|
||||
self.sentence_re = re.compile(r"[^.!?]*[.!?]+\"?\s*", re.S)
|
||||
# A regular expression used to break the story into sentences so that the author's
|
||||
# note can be inserted with minimal disruption. Avoid ending a sentence with
|
||||
# whitespace because most tokenizers deal better with whitespace at the beginning of text.
|
||||
# Search for sentence end delimiters (i.e. .!?) and also capture closing parentheses and quotes.
|
||||
self.sentence_re = re.compile(r".*?[.!?]+(?=[.!?\]\)}>'\"›»\s])[.!?\]\)}>'\"›»]*", re.S)
|
||||
self.story_settings = story_settings
|
||||
self.tts_model = None
|
||||
self.tortoise = None
|
||||
|
Reference in New Issue
Block a user