From f112fc3493ece6fd62719b915f407f6a7bff782d Mon Sep 17 00:00:00 2001
From: Henk
Date: Fri, 17 Jun 2022 13:49:03 +0200
Subject: [PATCH 1/5] Initial flaskwebgui support

---
 aiserver.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index 87385bad..419b5930 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -323,6 +323,7 @@ class vars:
     actionmode = 1
     dynamicscan = False
     host = False
+    flaskwebgui = False
     nopromptgen = False
     rngpersist = False
     nogenmod = False
@@ -2765,6 +2766,8 @@ def do_connect():
     emit('from_server', {'cmd': 'connected', 'smandelete': vars.smandelete, 'smanrename': vars.smanrename, 'modelname': getmodelname()})
     if(vars.host):
         emit('from_server', {'cmd': 'runs_remotely'})
+    if(vars.flaskwebgui):
+        emit('from_server', {'cmd': 'flaskwebgui'})
     if(vars.allowsp):
         emit('from_server', {'cmd': 'allowsp', 'data': vars.allowsp})
 
@@ -5898,15 +5901,26 @@ if __name__ == "__main__":
             vars.serverstarted = True
             socketio.run(app, host='0.0.0.0', port=port)
     else:
-        import webbrowser
-        webbrowser.open_new('http://localhost:{0}'.format(port))
-        print("{0}Server started!\nYou may now connect with a browser at http://127.0.0.1:{1}/{2}"
-              .format(colors.GREEN, port, colors.END))
-        vars.serverstarted = True
         if args.unblock:
+            import webbrowser
+            webbrowser.open_new('http://localhost:{0}'.format(port))
+            print("{0}Server started!\nYou may now connect with a browser at http://127.0.0.1:{1}/{2}"
+                  .format(colors.GREEN, port, colors.END))
+            vars.serverstarted = True
             socketio.run(app, port=port, host='0.0.0.0')
         else:
-            socketio.run(app, port=port)
+            try:
+                from flaskwebgui import FlaskUI
+                vars.serverstarted = True
+                vars.flaskwebgui = True
+                FlaskUI(app, socketio=socketio, start_server="flask-socketio", maximized=True, close_server_on_exit=False).run()
+            except:
+                import webbrowser
+                webbrowser.open_new('http://localhost:{0}'.format(port))
+                print("{0}Server started!\nYou may now connect with a browser at http://127.0.0.1:{1}/{2}"
+                      .format(colors.GREEN, port, colors.END))
+                vars.serverstarted = True
+                socketio.run(app, port=port)
 
 else:
     general_startup()

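The launch fallback introduced above reduces to the sketch below. It assumes the flaskwebgui 0.3.x FlaskUI signature used throughout this series (newer flaskwebgui releases changed the constructor), and the port value is a placeholder for the one aiserver.py derives from --port:

    import webbrowser
    from flask import Flask
    from flask_socketio import SocketIO

    app = Flask(__name__)
    socketio = SocketIO(app)
    port = 5000  # placeholder

    try:
        # Preferred path: wrap the Flask-SocketIO server in a desktop window.
        from flaskwebgui import FlaskUI
        FlaskUI(app, socketio=socketio, start_server="flask-socketio",
                maximized=True, close_server_on_exit=False).run()
    except Exception:
        # flaskwebgui missing or unable to start: fall back to a browser tab.
        webbrowser.open_new("http://localhost:{0}".format(port))
        socketio.run(app, port=port)
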
From 2964175d8b8fbff7991cadd9fc3dbcbf9d07847a Mon Sep 17 00:00:00 2001
From: ebolam
Date: Fri, 17 Jun 2022 08:17:22 -0400
Subject: [PATCH 2/5] Fix for flaskwebgui

---
 aiserver.py             |  4 ++--
 static/flask_web_gui.js | 20 ++++++++++++++++++++
 templates/index.html    |  3 +++
 3 files changed, 25 insertions(+), 2 deletions(-)
 create mode 100644 static/flask_web_gui.js

diff --git a/aiserver.py b/aiserver.py
index 419b5930..84fc1fae 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -2068,7 +2068,7 @@ def index():
     if 'new_ui' in request.args:
         return render_template('index_new.html', hide_ai_menu=args.noaimenu)
     else:
-        return render_template('index.html', hide_ai_menu=args.noaimenu)
+        return render_template('index.html', hide_ai_menu=args.noaimenu, flaskwebgui=vars.flaskwebgui)
 @app.route('/favicon.ico')
 def favicon():
     return send_from_directory(app.root_path,
@@ -5913,7 +5913,7 @@ if __name__ == "__main__":
                 from flaskwebgui import FlaskUI
                 vars.serverstarted = True
                 vars.flaskwebgui = True
-                FlaskUI(app, socketio=socketio, start_server="flask-socketio", maximized=True, close_server_on_exit=False).run()
+                FlaskUI(app, socketio=socketio, start_server="flask-socketio", maximized=True, close_server_on_exit=True).run()
             except:
                 import webbrowser
                 webbrowser.open_new('http://localhost:{0}'.format(port))
diff --git a/static/flask_web_gui.js b/static/flask_web_gui.js
new file mode 100644
index 00000000..8571d84f
--- /dev/null
+++ b/static/flask_web_gui.js
@@ -0,0 +1,20 @@
+async function getRequest(url='') {
+    const response = await fetch(url, {
+        method: 'GET',
+        cache: 'no-cache'
+    })
+}
+
+document.addEventListener('DOMContentLoaded', function() {
+
+let url = document.location
+let route = "/flaskwebgui-keep-server-alive";
+let interval_request = 3 * 1000; //sec
+
+function keep_alive_server(){
+    getRequest(url + route);
+}
+
+setInterval(keep_alive_server, interval_request);
+
+})
\ No newline at end of file
diff --git a/templates/index.html b/templates/index.html
index 0d77fd49..3f3aa876 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -19,6 +19,9 @@
+    {% if flaskwebgui %}
+    <script src="static/flask_web_gui.js"></script>
+    {% endif %}

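For orientation: flask_web_gui.js above pings a keep-alive route every three seconds, and with close_server_on_exit=True flaskwebgui stops the server once those pings cease, i.e. once the window is closed. The sketch below is a conceptual illustration of that handshake only; the route name is taken from the script, but the watchdog itself is hypothetical and is not flaskwebgui's actual implementation:

    import os
    import threading
    import time

    from flask import Flask

    app = Flask(__name__)
    last_ping = time.time()

    @app.route("/flaskwebgui-keep-server-alive")
    def keep_alive():
        # Hit by the browser every 3 seconds while the window is open.
        global last_ping
        last_ping = time.time()
        return "ok"

    def watchdog(timeout=10):
        # Once pings stop for `timeout` seconds, assume the window closed.
        while time.time() - last_ping < timeout:
            time.sleep(1)
        os._exit(0)

    threading.Thread(target=watchdog, daemon=True).start()
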
From f71bae254ab837c9955a5621fa5bb93529f6f15b Mon Sep 17 00:00:00 2001
From: Gnome Ann <>
Date: Fri, 17 Jun 2022 13:29:42 -0400
Subject: [PATCH 3/5] Fix OPT tokenization problems

---
 aiserver.py | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index f1461070..7e21dfe9 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -67,6 +67,17 @@ def new_init(self, *args, **kwargs):
     self.ncols = 99
 tqdm.__init__ = new_init
 
+# Fix some issues with the OPT tokenizer
+from transformers import PreTrainedTokenizerBase
+old_pretrainedtokenizerbase_from_pretrained = PreTrainedTokenizerBase.from_pretrained.__func__
+@classmethod
+def new_pretrainedtokenizerbase_from_pretrained(cls, *args, **kwargs):
+    tokenizer = old_pretrainedtokenizerbase_from_pretrained(cls, *args, **kwargs)
+    tokenizer._koboldai_header = tokenizer.encode("")
+    tokenizer.add_bos_token = False
+    tokenizer.add_prefix_space = False
+    return tokenizer
+PreTrainedTokenizerBase.from_pretrained = new_pretrainedtokenizerbase_from_pretrained
 
 #==================================================================#
 # Variables & Storage
@@ -1697,6 +1708,9 @@ if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "Go
                 # Then save the pytorch_model-#####-of-#####.bin files
                 for filename in filenames:
                     shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, filename, revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), filename))
+                # If the model has a tokenizer_config.json, preserve the original file instead of using the one output by tokenizer.save_pretrained (using the file output by tokenizer.save_pretrained can break OPT-350M in transformers 4.20.0)
+                if(os.path.isfile(os.path.join("models/{}".format(vars.model.replace('/', '_')), "tokenizer_config.json"))):
+                    shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, "tokenizer_config.json", revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), "tokenizer_config.json"))
                 shutil.rmtree("cache/")
 
     if(vars.hascuda):
@@ -3326,24 +3340,26 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None,
         global tokenizer
         tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache")
 
+    lnheader = len(tokenizer._koboldai_header)
+
     # Calculate token budget
     prompttkns = tokenizer.encode(utils.encodenewlines(vars.comregex_ai.sub('', vars.prompt)), max_length=int(2e9), truncation=True)
     lnprompt = len(prompttkns)
 
     memtokens = tokenizer.encode(utils.encodenewlines(mem), max_length=int(2e9), truncation=True)
     lnmem = len(memtokens)
-    if(lnmem > vars.max_length - lnsp - vars.genamt - budget_deduction):
+    if(lnmem > vars.max_length - lnheader - lnsp - vars.genamt - budget_deduction):
         raise OverflowError("The memory in your story is too long. Please either write a shorter memory text or increase the Max Tokens setting. If you are using a soft prompt, additionally consider using a smaller soft prompt.")
 
     witokens = tokenizer.encode(utils.encodenewlines(winfo), max_length=int(2e9), truncation=True)
     lnwi = len(witokens)
-    if(lnmem + lnwi > vars.max_length - lnsp - vars.genamt - budget_deduction):
+    if(lnmem + lnwi > vars.max_length - lnheader - lnsp - vars.genamt - budget_deduction):
         raise OverflowError("The current active world info keys take up too many tokens. Please either write shorter world info, decrease World Info Depth or increase the Max Tokens setting. If you are using a soft prompt, additionally consider using a smaller soft prompt.")
 
     if(anotetxt != ""):
         anotetkns = tokenizer.encode(utils.encodenewlines(anotetxt), max_length=int(2e9), truncation=True)
         lnanote = len(anotetkns)
-        if(lnmem + lnwi + lnanote > vars.max_length - lnsp - vars.genamt - budget_deduction):
+        if(lnmem + lnwi + lnanote > vars.max_length - lnheader - lnsp - vars.genamt - budget_deduction):
             raise OverflowError("The author's note in your story is too long. Please either write a shorter author's note or increase the Max Tokens setting. If you are using a soft prompt, additionally consider using a smaller soft prompt.")
 
     if(vars.useprompt):
@@ -3354,14 +3370,14 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None,
     lnsubmission = len(tokenizer.encode(utils.encodenewlines(vars.comregex_ai.sub('', submission)), max_length=int(2e9), truncation=True)) if submission is not None else 0
     maybe_lnprompt = lnprompt if vars.useprompt and actionlen > 0 else 0
 
-    if(lnmem + lnwi + lnanote + maybe_lnprompt + lnsubmission > vars.max_length - lnsp - vars.genamt - budget_deduction):
+    if(lnmem + lnwi + lnanote + maybe_lnprompt + lnsubmission > vars.max_length - lnheader - lnsp - vars.genamt - budget_deduction):
         raise OverflowError("Your submission is too long. Please either write a shorter submission or increase the Max Tokens setting. If you are using a soft prompt, additionally consider using a smaller soft prompt. If you are using the Always Add Prompt setting, turning it off may help.")
 
     assert budget >= 0
 
     if(actionlen == 0):
         # First/Prompt action
-        tokens = memtokens + witokens + anotetkns + prompttkns
+        tokens = tokenizer._koboldai_header + memtokens + witokens + anotetkns + prompttkns
         assert len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction
         ln = len(tokens) + lnsp
         return tokens, ln+1, ln+vars.genamt
@@ -3409,12 +3425,12 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None,
         # Did we get to add the A.N.? If not, do it here
         if(anotetxt != ""):
             if((not anoteadded) or forceanote):
-                tokens = memtokens + witokens + anotetkns + prompttkns + tokens
+                tokens = tokenizer._koboldai_header + memtokens + witokens + anotetkns + prompttkns + tokens
             else:
-                tokens = memtokens + witokens + prompttkns + tokens
+                tokens = tokenizer._koboldai_header + memtokens + witokens + prompttkns + tokens
         else:
             # Prepend Memory, WI, and Prompt before action tokens
-            tokens = memtokens + witokens + prompttkns + tokens
+            tokens = tokenizer._koboldai_header + memtokens + witokens + prompttkns + tokens
 
         # Send completed bundle to generator
         assert len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction

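The _koboldai_header captured above exists because OPT's tokenizer prepends a BOS token (</s>, id 2) to every encode() call, so even the empty string encodes to one token; budgeting lnheader and prepending the header explicitly keeps the token math consistent once add_bos_token is turned off. A quick check of the idea, assuming transformers is installed and using facebook/opt-125m purely as an example checkpoint:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("facebook/opt-125m")  # example OPT model
    header = tok.encode("")       # e.g. [2]: the BOS (</s>) OPT adds to everything
    tok.add_bos_token = False     # as the patch does, so the BOS is not added twice
    tok.add_prefix_space = False

    # The generator input is then assembled as header + context, and the
    # budget reserves len(header) tokens for it.
    tokens = header + tok.encode("You awaken in a dark cave.")
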
From 5e71f7fe976e351dfafd622521a4ec3cc731cef2 Mon Sep 17 00:00:00 2001
From: Gnome Ann <>
Date: Fri, 17 Jun 2022 21:08:37 -0400
Subject: [PATCH 4/5] Use slow tokenizer if fast tokenizer is not available

---
 aiserver.py        | 15 ++++++++++++---
 tpu_mtj_backend.py | 12 ++++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index 7e21dfe9..ff28db74 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -1635,6 +1635,10 @@ if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "Go
         if(os.path.isdir(vars.custmodpth)):
             try:
                 tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
+            except Exception as e:
+                pass
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", use_fast=False)
             except Exception as e:
                 try:
                     tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
@@ -1647,6 +1651,10 @@ if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "Go
         elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))):
             try:
                 tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache")
+            except Exception as e:
+                pass
+            try:
+                tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", use_fast=False)
             except Exception as e:
                 try:
                     tokenizer = GPT2TokenizerFast.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache")
@@ -1672,6 +1680,10 @@ if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "Go
 
             try:
                 tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache")
+            except Exception as e:
+                pass
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", use_fast=False)
             except Exception as e:
                 try:
                     tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache")
@@ -1708,9 +1720,6 @@ if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "Go
                 # Then save the pytorch_model-#####-of-#####.bin files
                 for filename in filenames:
                     shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, filename, revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), filename))
-                # If the model has a tokenizer_config.json, preserve the original file instead of using the one output by tokenizer.save_pretrained (using the file output by tokenizer.save_pretrained can break OPT-350M in transformers 4.20.0)
-                if(os.path.isfile(os.path.join("models/{}".format(vars.model.replace('/', '_')), "tokenizer_config.json"))):
-                    shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, "tokenizer_config.json", revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), "tokenizer_config.json"))
                 shutil.rmtree("cache/")
 
     if(vars.hascuda):
diff --git a/tpu_mtj_backend.py b/tpu_mtj_backend.py
index 67e006d6..bc228998 100644
--- a/tpu_mtj_backend.py
+++ b/tpu_mtj_backend.py
@@ -1324,6 +1324,10 @@ def load_model(path: str, driver_version="tpu_driver0.1_dev20210607", hf_checkpo
     if(os.path.isdir(vars.custmodpth)):
         try:
             tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
+        except Exception as e:
+            pass
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", use_fast=False)
         except Exception as e:
             try:
                 tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache")
@@ -1336,6 +1340,10 @@ def load_model(path: str, driver_version="tpu_driver0.1_dev20210607", hf_checkpo
     elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))):
         try:
             tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache")
+        except Exception as e:
+            pass
+        try:
+            tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", use_fast=False)
         except Exception as e:
             try:
                 tokenizer = GPT2TokenizerFast.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache")
@@ -1348,6 +1356,10 @@ def load_model(path: str, driver_version="tpu_driver0.1_dev20210607", hf_checkpo
     else:
         try:
             tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache")
+        except Exception as e:
+            pass
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", use_fast=False)
         except Exception as e:
             try:
                 tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache")

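The repeated try/except blocks above implement one fixed fallback order: the fast AutoTokenizer, then the slow one (use_fast=False), then the pre-existing GPT-2 fallbacks (GPT2TokenizerFast is the one visible in these hunks; the plain GPT2Tokenizer step is assumed here). A compact sketch of that chain, with a hypothetical helper name, since the patch keeps the nested try/excepts inline:

    from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast

    def load_tokenizer(path, revision=None, cache_dir="cache"):
        # Return the first tokenizer that loads successfully.
        loaders = [
            lambda: AutoTokenizer.from_pretrained(path, revision=revision, cache_dir=cache_dir),
            lambda: AutoTokenizer.from_pretrained(path, revision=revision, cache_dir=cache_dir, use_fast=False),
            lambda: GPT2TokenizerFast.from_pretrained(path, revision=revision, cache_dir=cache_dir),
            lambda: GPT2Tokenizer.from_pretrained(path, revision=revision, cache_dir=cache_dir),
        ]
        errors = []
        for loader in loaders:
            try:
                return loader()
            except Exception as e:
                errors.append(e)
        raise RuntimeError("No usable tokenizer for {}: {}".format(path, errors))

Note that in the patch as written the use_fast=False attempt always runs, because the first except block only passes; the sketch instead keeps the first loader that succeeds.
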
From b209cf98683484958735c1ab32ac467f97eb97a1 Mon Sep 17 00:00:00 2001
From: henk717
Date: Sat, 18 Jun 2022 19:46:16 +0200
Subject: [PATCH 5/5] NS mode as default

Experimental change that makes NS the default. More and more models seem
to require it as Megatron-based models gain traction, and it does not
appear to break the original models (with the exception of a user not
being able to use </s> in generated outputs; in the extremely rare case
someone is affected by this, they can manually switch the mode by editing
their settings file).

If this breaks nothing, ns will remain the default; however, the n mode
should remain a choice for those who need it. In case this does get
reversed, I have also added the bloom model type to the ns list, since
its models require this.
---
 aiserver.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index 81af6f31..c43f9f0f 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -339,7 +339,7 @@ class vars:
     rngpersist = False
     nogenmod = False
     welcome = False # Custom Welcome Text (False is default)
-    newlinemode = "n"
+    newlinemode = "ns"
     quiet = False # If set will suppress any story text from being printed to the console (will only be seen on the client web page)
     debug = False # If set to true, will send debug information to the client for display
     lazy_load = True # Whether or not to use torch_lazy_loader.py for transformers models in order to reduce CPU memory usage
@@ -661,7 +661,7 @@ def loadmodelsettings():
             js = {}
     if vars.model_type == "xglm" or js.get("compat", "j") == "fairseq_lm":
         vars.newlinemode = "s" # Default to </s> newline mode if using XGLM
-    if vars.model_type == "opt":
+    if vars.model_type == "opt" or vars.model_type == "bloom":
         vars.newlinemode = "ns" # Handle </s> but don't convert newlines if using Fairseq models that have newlines trained in them
     vars.modelconfig = js
     if("badwordsids" in js):
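After this patch the newline-mode selection in loadmodelsettings() reduces to the following sketch (the standalone function name is hypothetical; the real code reads vars.model_type plus the model config and writes vars.newlinemode):

    def pick_newlinemode(model_type, compat="j"):
        newlinemode = "ns"        # new global default (previously "n")
        if model_type == "xglm" or compat == "fairseq_lm":
            newlinemode = "s"     # XGLM / fairseq-style models
        if model_type in ("opt", "bloom"):
            newlinemode = "ns"    # handle </s> but keep literal newlines
        return newlinemode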