From 09200856958b6b0d587d0c9d5e85922da38619e4 Mon Sep 17 00:00:00 2001
From: Henk <henk@henk.tech>
Date: Tue, 31 Jan 2023 21:00:17 +0100
Subject: [PATCH] Experimental EOT Support

---
 aiserver.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/aiserver.py b/aiserver.py
index 49a9ae16..edd3b646 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -6491,10 +6491,14 @@ def applyoutputformatting(txt, no_sentence_trimming=False, no_single_line=False)
     if len(txt) == 0:
         return txt
     
-    # Workaround for endoftext appearing in models that need it, you can supposedly do this directly with the tokenizer but it keeps showing up
-    # So for now since we only have two known end of text tokens and only one model that wishes to have its generation stopped this is easier
-    # If you see this and you wish to do a universal implementation for this, feel free just make sure to test it on all platforms - Henk
-    txt = txt.replace("<|endoftext|>", "")
+    # Cut the output at <|endoftext|> (the token and everything after it) for models that want this
+    # In the future it would be nice if we could extend this to all EOS models.
+    # However, since EOS detection may have unforeseen consequences, for now we hardcode <|endoftext|> until more can be tested
+    # - Henk
+    eotregex = re.compile(r'<\|endoftext\|>[.|\n|\W|\w]*')
+    txt = eotregex.sub('', txt)
+
+    # Cleanup stray </s>
     txt = txt.replace("</s>", "")
 
     # Use standard quotes and apostrophes