Merge remote-tracking branch 'origin/united' into model-structure-update

2025-06-05 21:59:24 +02:00 · 2023-05-04 07:31:13 +02:00
parent 1166c07bc3 a87d5d6f23
commit ef358fdf5a
5 changed files with 110 additions and 20 deletions
--- a/aiserver.py
+++ b/aiserver.py
@@ -3555,7 +3555,7 @@ def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False,
                botname = (koboldai_vars.botname + ":")
            else:
                botname = ""
-            data = re.sub(r'\n+', ' ', data)
+            data = re.sub(r'\n+\Z', '', data)
            if(len(data)):
                data = f"\n{koboldai_vars.chatname}: {data}\n{botname}"
        
@@ -6354,7 +6354,7 @@ def UI_2_download_story():
@logger.catch
 def UI_2_Set_Selected_Text(data):
    if not koboldai_vars.quiet:
-        print("Updating Selected Text: {}".format(data))
+        logger.info("Updating Selected Text: {}".format(data))
    action_id = int(data["id"])

    if not koboldai_vars.actions.actions[action_id].get("Original Text"):
--- a/koboldai_settings.py
+++ b/koboldai_settings.py
@@ -397,8 +397,8 @@ class koboldai_vars(object):
        
        ######################################### Setup Author's Note Data ########################################################
        authors_note_text = self.authornotetemplate.replace("<|>", self.authornote)
-        if len(authors_note_text) > 0 and authors_note_text[-1] not in [" ", "\n"]:
-            authors_note_text += " "
+        if len(authors_note_text) > 0 and authors_note_text[0] not in [" ", "\n"]:
+            authors_note_text = " " + authors_note_text
        authors_note_data = [[x, self.tokenizer.decode(x)] for x in self.tokenizer.encode(authors_note_text)]
        if used_tokens + len(authors_note_data) <= token_budget:
            used_tokens += len(authors_note_data)
@@ -1393,7 +1393,11 @@ class KoboldStoryRegister(object):
        self.action_count = -1
        # The id of the last submission action, or 0 if the last append was not a submission
        self.submission_id = 0
-        self.sentence_re = re.compile(r"[^.!?]*[.!?]+\"?\s*", re.S)
+        # A regular expression used to break the story into sentences so that the author's
+        # note can be inserted with minimal disruption. Avoid ending a sentence with
+        # whitespace because most tokenizers deal better with whitespace at the beginning of text.
+        # Search for sentence end delimiters (i.e. .!?) and also capture closing parentheses, brackets and quotes.
+        self.sentence_re = re.compile(r".*?[.!?]+(?=[.!?\]\)}>'\"›»\s])[.!?\]\)}>'\"›»]*", re.S)
        self.story_settings = story_settings
        self.tts_model = None
        self.tortoise = None
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -20,8 +20,89 @@ class HFInferenceModel(InferenceModel):
    def _post_load(self) -> None:
        # These are model specific tokenizer overrides if a model has bad defaults
        if utils.koboldai_vars.model_type == "llama":
-            self.tokenizer.decode_with_prefix_space = True
+            # Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer
            self.tokenizer.add_bos_token = False
+
+            # HF transformers no longer supports decode_with_prefix_space
+            # We work around this by wrapping decode, encode, and __call__
+            # with versions that work around the 'prefix space' misfeature
+            # of sentencepiece.
+            vocab = self.tokenizer.convert_ids_to_tokens(range(self.tokenizer.vocab_size))
+            has_prefix_space = {i for i, tok in enumerate(vocab) if tok.startswith("▁")}
+
+            # Wrap 'decode' with a method that always returns text starting with a space
+            # when the head token starts with a space. This is what 'decode_with_prefix_space'
+            # used to do, and we implement it using the same technique (building a cache of
+            # tokens that should have a prefix space, and then prepending a space if the first
+            # token is in this set.) We also work around a bizarre behavior in which decoding
+            # a single token 13 behaves differently than decoding a sequence containing only [13].
+            original_decode = type(self.tokenizer.tokenizer).decode
+            def decode_wrapper(self, token_ids, *args, **kwargs):
+                first = None
+                # Note, the code below that wraps single-value token_ids in a list
+                # is to work around this wonky behavior:
+                #   >>> t.decode(13)
+                #   '<0x0A>'
+                #   >>> t.decode([13])
+                #   '\n'
+                # Not doing this causes token streaming to receive <0x0A> characters
+                # instead of newlines.
+                if isinstance(token_ids, int):
+                    first = token_ids
+                    token_ids = [first]
+                elif hasattr(token_ids, 'dim'): # Check for e.g. torch.Tensor
+                    # Tensors don't support the Python standard of 'empty is False'
+                    # and the special case of dimension 0 tensors also needs to be
+                    # handled separately.
+                    if token_ids.dim() == 0:
+                        first = int(token_ids.item())
+                        token_ids = [first]
+                    elif len(token_ids) > 0:
+                        first = int(token_ids[0])
+                elif token_ids:
+                    first = token_ids[0]
+                result = original_decode(self, token_ids, *args, **kwargs)
+                if first is not None and first in has_prefix_space:
+                    result = " " + result
+                return result
+            # GenericTokenizer overrides __setattr__ so we need to use object.__setattr__ to bypass it
+            object.__setattr__(self.tokenizer, 'decode', decode_wrapper.__get__(self.tokenizer))
+
+            # Wrap encode and __call__ to work around the 'prefix space' misfeature also.
+            # The problem is that "Bob" at the start of text is encoded as if it is
+            # " Bob". This creates a problem because it means you can't split text, encode
+            # the pieces, concatenate the tokens, decode them, and get the original text back.
+            # The workaround is to prepend a known token that (1) starts with a space; and
+            # (2) is not the prefix of any other token. After searching through the vocab
+            # " ," (space comma) is the only token containing only printable ascii characters
+            # that fits this bill. By prepending ',' to the text, the original encode
+            # method always returns [1919, ...], where the tail of the sequence is the
+            # actual encoded result we want without the prefix space behavior.
+            original_encode = type(self.tokenizer.tokenizer).encode
+            def encode_wrapper(self, text, *args, **kwargs):
+                if type(text) is str:
+                    text = ',' + text
+                    result = original_encode(self, text, *args, **kwargs)
+                    result = result[1:]
+                else:
+                    result = original_encode(self, text, *args, **kwargs)
+                return result
+            object.__setattr__(self.tokenizer, 'encode', encode_wrapper.__get__(self.tokenizer))
+
+            # Since 'encode' is documented as being deprecated, also override __call__.
+            # This doesn't appear to currently be used by KoboldAI, but doing so
+            # in case someone uses it in the future.
+            original_call = type(self.tokenizer.tokenizer).__call__
+            def call_wrapper(self, text, *args, **kwargs):
+                if type(text) is str:
+                    text = ',' + text
+                    result = original_call(self, text, *args, **kwargs)
+                    result = result[1:]
+                else:
+                    result = original_call(self, text, *args, **kwargs)
+                return result
+            object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer))
+
        elif utils.koboldai_vars.model_type == "opt":
            self.tokenizer._koboldai_header = self.tokenizer.encode("")
            self.tokenizer.add_bos_token = False
--- a/static/koboldai.js
+++ b/static/koboldai.js
@@ -3127,7 +3127,6 @@ function gametextwatcher(records) {
 			}
 		}
 	}
-	//console.log(dirty_chunks);
 }

 function fix_dirty_game_text() {
@@ -3152,16 +3151,26 @@ function fix_dirty_game_text() {
 		//Fixing text outside of chunks
 		for (node of game_text.childNodes) {
 			if ((!(node instanceof HTMLElement) || !node.hasAttribute("chunk")) && (node.textContent.trim() != "")) {
-				console.log("Found Node that needs to be combined");
-				console.log(node);
-				//We have a text only node. It should be moved into the previous chunk
+				//We have a text only node. Merge it into the previous chunk if that chunk is dirty; otherwise into the next chunk if that one is dirty; if neither is dirty (or there is no next chunk), into the previous chunk
+				var node_text = ""
 				if (node instanceof HTMLElement) {
-					node.previousElementSibling.innerText = node.previousElementSibling.innerText + node.innerText;
+					node_text = node.innerText;
 				} else {
-					node.previousElementSibling.innerText = node.previousElementSibling.innerText + node.data;
+					node_text = node.data;
 				}
-				if (!dirty_chunks.includes(node.previousElementSibling.getAttribute("chunk"))) {
-					dirty_chunks.push(node.previousElementSibling.getAttribute("chunk"));
+				if (!(node.nextElementSibling) || !(dirty_chunks.includes(node.nextElementSibling.getAttribute("chunk"))) || dirty_chunks.includes(node.previousElementSibling.getAttribute("chunk"))) {
+					node.previousElementSibling.innerText = node.previousElementSibling.innerText + node_text;
+					if (!dirty_chunks.includes(node.previousElementSibling.getAttribute("chunk"))) {
+						dirty_chunks.push(node.previousElementSibling.getAttribute("chunk"));
+					}
+				} else {
+					node.nextElementSibling.innerText = node.nextElementSibling.innerText + node_text;
+				}
+				
+				//Looks like sometimes it splits the parent. Let's look for that and fix it too
+				if (node.nextElementSibling && (node.nextElementSibling.getAttribute("chunk") == node.previousElementSibling.getAttribute("chunk"))) {
+					node.previousElementSibling.innerText = node.previousElementSibling.innerText + node.nextElementSibling.innerText;
+					node.nextElementSibling.remove();
 				}
 				node.remove();
 			}
--- a/utils.py
+++ b/utils.py
@@ -144,21 +144,17 @@ def singlelineprocessing(txt, koboldai_vars):
    return txt

 def chatmodeprocessing(txt, koboldai_vars):
-    chatregex = re.compile(r'%s:[.|\n|\W|\w]*'%koboldai_vars.chatname)
+    chatregex = re.compile(r'\s+%s:[.|\n|\W|\w]*'%koboldai_vars.chatname)
    txt = chatregex.sub('', txt)
    if(len(koboldai_vars.actions) > 0):
        if(len(koboldai_vars.actions[-1]) > 0):
            action = koboldai_vars.actions[-1]
-            lastchar = action[-1] if len(action) else ""
        else:
            # Last action is blank, this should never happen, but
            # since it did let's bail out.
            return txt
    else:
        action = koboldai_vars.prompt
-        lastchar = action[-1] if len(action) else ""
-    if(lastchar != "\n"):
-        txt = txt + "\n"
    return txt

 #==================================================================#
@@ -745,4 +741,4 @@ def applyoutputformatting(txt, no_sentence_trimming=False, no_single_line=False)
            txt = txt.replace(sub["trueTarget"], sub["substitution"])
            txt = txt.replace(sub["target"], sub["substitution"])
    
-    return txt
+    return txt