Fix tokenization and whitespace issues with llama-derived models

Work around the 'soft' prefix space behavior of sentencepiece.
Override encode to restore the deleted HF support for decode_with_prefix_space.
Override decode to skip the soft space and return the true decoded text.
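
A minimal sketch of the decode workaround, assuming a Hugging Face LlamaTokenizer; the subclass name and the anchor-token trick are illustrative, not this commit's exact code:

from transformers import LlamaTokenizer

class PrefixSpaceTokenizer(LlamaTokenizer):
    # Hypothetical subclass sketching the workaround described above.
    def decode(self, token_ids, **kwargs):
        # sentencepiece drops the soft leading space of the first token it
        # decodes. Decode behind an anchor token so the drop lands on the
        # anchor, then slice the anchor's text off to recover the true string.
        anchor = self.encode("x", add_special_tokens=False)
        anchor_text = super().decode(anchor, **kwargs)
        full = super().decode(anchor + list(token_ids), **kwargs)
        return full[len(anchor_text):]
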
Allow submitting chat messages with embedded newlines.
Split sentences between punctuation and whitespace, rather than after whitespace.
Also include trailing quotes and brackets after sentence stoppers.
This avoids splitting ." and .) across two sentences, for instance.
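
A sketch of the splitting rule, using an illustrative pattern rather than the commit's exact regex:

import re

# A sentence keeps its leading whitespace, runs to one or more stoppers,
# and absorbs any closing quotes/brackets so ." and .) stay attached.
SENTENCE = re.compile(r'\s*.*?[.!?]+["\'\)\]]*(?=\s|$)')

def split_sentences(text):
    return SENTENCE.findall(text)

# split_sentences('He said "Go." Then (they left.) Done.')
# -> ['He said "Go."', ' Then (they left.)', ' Done.']
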
Insert whitespace at the beginning of the author's note, since sentences are
split with leading whitespace.
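The corresponding author's-note adjustment could look like this hypothetical snippet (authors_note is an assumed variable name):

# Give the author's note a leading space so it joins cleanly with the
# whitespace-prefixed sentences around it.
if authors_note and not authors_note[0].isspace():
    authors_note = " " + authors_note
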
Remove spurious newlines at the end of chat responses.
Llama
2023-05-03 01:27:11 -07:00
parent 507da6fcf7
commit 3768848548
4 changed files with 94 additions and 11 deletions


@@ -144,21 +144,17 @@ def singlelineprocessing(txt, koboldai_vars):
     return txt
 
 def chatmodeprocessing(txt, koboldai_vars):
-    chatregex = re.compile(r'%s:[.|\n|\W|\w]*'%koboldai_vars.chatname)
+    chatregex = re.compile(r'\s+%s:[.|\n|\W|\w]*'%koboldai_vars.chatname)
     txt = chatregex.sub('', txt)
     if(len(koboldai_vars.actions) > 0):
         if(len(koboldai_vars.actions[-1]) > 0):
             action = koboldai_vars.actions[-1]
-            lastchar = action[-1] if len(action) else ""
         else:
             # Last action is blank, this should never happen, but
             # since it did let's bail out.
             return txt
     else:
         action = koboldai_vars.prompt
-        lastchar = action[-1] if len(action) else ""
-    if(lastchar != "\n"):
-        txt = txt + "\n"
     return txt
 
 #==================================================================#
@@ -745,4 +741,4 @@ def applyoutputformatting(txt, no_sentence_trimming=False, no_single_line=False):
             txt = txt.replace(sub["trueTarget"], sub["substitution"])
             txt = txt.replace(sub["target"], sub["substitution"])
 
-        return txt
+    return txt