diff --git a/aiserver.py b/aiserver.py index f1461070..ff49ae9d 100644 --- a/aiserver.py +++ b/aiserver.py @@ -45,6 +45,7 @@ import sys import gc import lupa +import importlib # KoboldAI import fileops @@ -52,11 +53,22 @@ import gensettings from utils import debounce import utils import structures +import torch +from transformers import StoppingCriteria, GPT2TokenizerFast, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, modeling_utils +from transformers import __version__ as transformers_version +import transformers +try: + from transformers.models.opt.modeling_opt import OPTDecoder +except: + pass +import transformers.generation_utils +global tpu_mtj_backend if lupa.LUA_VERSION[:2] != (5, 4): print(f"Please install lupa==1.10. You have lupa {lupa.__version__}.", file=sys.stderr) +patch_causallm_patched = False # Make sure tqdm progress bars display properly in Colab from tqdm.auto import tqdm @@ -83,115 +95,111 @@ class colors: END = '\033[0m' UNDERLINE = '\033[4m' -# AI models -mainmenu = [ - ["Load a model from its directory", "NeoCustom", ""], - ["Load an old GPT-2 model (eg CloverEdition)", "GPT2Custom", ""], - ["Adventure Models", "adventurelist", ""], - ["Novel Models", "novellist", ""], - ["NSFW Models", "nsfwlist", ""], - ["Untuned GPT-Neo/J", "gptneolist", ""], - ["Untuned Fairseq Dense", "fsdlist", ""], - ["Untuned OPT", "optlist", ""], - ["Untuned XGLM", "xglmlist", ""], - ["Untuned GPT2", "gpt2list", ""], - ["Online Services", "apilist", ""], - ["Read Only (No AI)", "ReadOnly", ""] +# AI models Menu +# This is a dict of lists where they key is the menu name, and the list is the menu items. +# Each item takes the 4 elements, 1: Text to display, 2: Model Name (var.model) or menu name (Key name for another menu), +# 3: the memory requirement for the model, 4: if the item is a menu or not (True/False) +model_menu = { + 'mainmenu': [ + ["Load a model from its directory", "NeoCustom", "", False], + ["Load an old GPT-2 model (eg CloverEdition)", "GPT2Custom", "", False], + ["Adventure Models", "adventurelist", "", True], + ["Novel Models", "novellist", "", True], + ["NSFW Models", "nsfwlist", "", True], + ["Untuned GPT-Neo/J", "gptneolist", "", True], + ["Untuned Fairseq Dense", "fsdlist", "", True], + ["Untuned OPT", "optlist", "", True], + ["Untuned XGLM", "xglmlist", "", True], + ["Untuned GPT2", "gpt2list", "", True], + ["Online Services", "apilist", "", True], + ["Read Only (No AI)", "ReadOnly", "", False] + ], + 'adventurelist': [ + ["Nerys FSD 13B (Hybrid)", "KoboldAI/fairseq-dense-13B-Nerys", "32GB", False], + ["Skein 6B", "KoboldAI/GPT-J-6B-Skein", "16GB", False], + ["Adventure 6B", "KoboldAI/GPT-J-6B-Adventure", "16GB", False], + ["Nerys FSD 2.7B (Hybrid)", "KoboldAI/fairseq-dense-2.7B-Nerys", "8GB", False], + ["Adventure 2.7B", "KoboldAI/GPT-Neo-2.7B-AID", "8GB", False], + ["Adventure 1.3B", "KoboldAI/GPT-Neo-1.3B-Adventure", "6GB", False], + ["Adventure 125M (Mia)", "Merry/AID-Neo-125M", "2GB", False], + ["Return to Main Menu", "mainmenu", "", True], + ], + 'novellist': [ + ["Nerys FSD 13B (Hybrid)", "KoboldAI/fairseq-dense-13B-Nerys", "32GB", False], + ["Janeway FSD 13B", "KoboldAI/fairseq-dense-13B-Janeway", "32GB", False], + ["Janeway FSD 6.7B", "KoboldAI/fairseq-dense-6.7B-Janeway", "16GB", False], + ["Janeway Neo 6B", "KoboldAI/GPT-J-6B-Janeway", "16GB", False], + ["Janeway Neo 2.7B", "KoboldAI/GPT-Neo-2.7B-Janeway", "8GB", False], + ["Janeway FSD 2.7B", "KoboldAI/fairseq-dense-2.7B-Janeway", "8GB", False], + 
["Nerys FSD 2.7B (Hybrid)", "KoboldAI/fairseq-dense-2.7B-Nerys", "8GB", False], + ["Horni-LN 2.7B", "KoboldAI/GPT-Neo-2.7B-Horni-LN", "8GB", False], + ["Picard 2.7B (Older Janeway)", "KoboldAI/GPT-Neo-2.7B-Picard", "8GB", False], + ["Return to Main Menu", "mainmenu", "", True], + ], + 'nsfwlist': [ + ["Shinen FSD 13B (NSFW)", "KoboldAI/fairseq-dense-13B-Shinen", "32GB", False], + ["Shinen FSD 6.7B (NSFW)", "KoboldAI/fairseq-dense-6.7B-Shinen", "16GB", False], + ["Lit 6B (NSFW)", "hakurei/lit-6B", "16GB", False], + ["Shinen 6B (NSFW)", "KoboldAI/GPT-J-6B-Shinen", "16GB", False], + ["Horni 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Horni", "8GB", False], + ["Shinen 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Shinen", "8GB", False], + ["Return to Main Menu", "mainmenu", "", True], + ], + 'chatlist': [ + ["Convo 6B (Chatbot)", "hitomi-team/convo-6B", "16GB", False], + ["C1 6B (Chatbot)", "hakurei/c1-6B", "16GB", False], + ["C1 1.3B (Chatbot)", "iokru/c1-1.3B", "6GB", False], + ["Return to Main Menu", "mainmenu", "", True], + ], + 'gptneolist': [ + ["GPT-J 6B", "EleutherAI/gpt-j-6B", "16GB", False], + ["GPT-Neo 2.7B", "EleutherAI/gpt-neo-2.7B", "8GB", False], + ["GPT-Neo 1.3B", "EleutherAI/gpt-neo-1.3B", "6GB", False], + ["GPT-Neo 125M", "EleutherAI/gpt-neo-125M", "2GB", False], + ["Return to Main Menu", "mainmenu", "", True], + ], + 'gpt2list': [ + ["GPT-2 XL", "gpt2-xl", "6GB", False], + ["GPT-2 Large", "gpt2-large", "4GB", False], + ["GPT-2 Med", "gpt2-medium", "2GB", False], + ["GPT-2", "gpt2", "2GB", False], + ["Return to Main Menu", "mainmenu", "", True], + ], + 'optlist': [ + ["OPT 30B", "facebook/opt-30b", "64GB", False], + ["OPT 13B", "facebook/opt-13b", "32GB", False], + ["OPT 6.7B", "facebook/opt-6.7b", "16GB", False], + ["OPT 2.7B", "facebook/opt-2.7b", "8GB", False], + ["OPT 1.3B", "facebook/opt-1.3b", "4GB", False], + ["OPT 350M", "facebook/opt-350m", "2GB", False], + ["OPT 125M", "facebook/opt-125m", "1GB", False], + ["Return to Main Menu", "mainmenu", "", True], + ], + 'fsdlist': [ + ["Fairseq Dense 13B", "KoboldAI/fairseq-dense-13B", "32GB", False], + ["Fairseq Dense 6.7B", "KoboldAI/fairseq-dense-6.7B", "16GB", False], + ["Fairseq Dense 2.7B", "KoboldAI/fairseq-dense-2.7B", "8GB", False], + ["Fairseq Dense 1.3B", "KoboldAI/fairseq-dense-1.3B", "4GB", False], + ["Fairseq Dense 355M", "KoboldAI/fairseq-dense-355M", "2GB", False], + ["Fairseq Dense 125M", "KoboldAI/fairseq-dense-125M", "1GB", False], + ["Return to Main Menu", "mainmenu", "", True], + ], + 'xglmlist': [ + ["XGLM 4.5B (Larger Dataset)", "facebook/xglm-4.5B", "12GB", False], + ["XGLM 7.5B", "facebook/xglm-7.5B", "18GB", False], + ["XGLM 2.9B", "facebook/xglm-2.9B", "10GB", False], + ["XGLM 1.7B", "facebook/xglm-1.7B", "6GB", False], + ["XGLM 564M", "facebook/xglm-564M", "4GB", False], + ["Return to Main Menu", "mainmenu", "", True], + ], + 'apilist': [ + ["GooseAI API (requires API key)", "GooseAI", "", False], + ["OpenAI API (requires API key)", "OAI", "", False], + ["InferKit API (requires API key)", "InferKit", "", False], + ["KoboldAI Server API (Old Google Colab)", "Colab", "", False], + ["Return to Main Menu", "mainmenu", "", True], ] - -adventurelist= [ - ["Nerys FSD 13B (Hybrid)", "KoboldAI/fairseq-dense-13B-Nerys", "32GB"], - ["Skein 6B", "KoboldAI/GPT-J-6B-Skein", "16GB"], - ["Adventure 6B", "KoboldAI/GPT-J-6B-Adventure", "16GB"], - ["Nerys FSD 2.7B (Hybrid)", "KoboldAI/fairseq-dense-2.7B-Nerys", "8GB"], - ["Adventure 2.7B", "KoboldAI/GPT-Neo-2.7B-AID", "8GB"], - ["Adventure 1.3B", "KoboldAI/GPT-Neo-1.3B-Adventure", 
"6GB"], - ["Adventure 125M (Mia)", "Merry/AID-Neo-125M", "2GB"], - ["Return to Main Menu", "Return", ""], -] - -novellist= [ - ["Nerys FSD 13B (Hybrid)", "KoboldAI/fairseq-dense-13B-Nerys", "32GB"], - ["Janeway FSD 13B", "KoboldAI/fairseq-dense-13B-Janeway", "32GB"], - ["Janeway FSD 6.7B", "KoboldAI/fairseq-dense-6.7B-Janeway", "16GB"], - ["Janeway Neo 6B", "KoboldAI/GPT-J-6B-Janeway", "16GB"], - ["Janeway Neo 2.7B", "KoboldAI/GPT-Neo-2.7B-Janeway", "8GB"], - ["Janeway FSD 2.7B", "KoboldAI/fairseq-dense-2.7B-Janeway", "8GB"], - ["Nerys FSD 2.7B (Hybrid)", "KoboldAI/fairseq-dense-2.7B-Nerys", "8GB"], - ["Horni-LN 2.7B", "KoboldAI/GPT-Neo-2.7B-Horni-LN", "8GB"], - ["Picard 2.7B (Older Janeway)", "KoboldAI/GPT-Neo-2.7B-Picard", "8GB"], - ["Return to Main Menu", "Return", ""], -] - -nsfwlist= [ - ["Shinen FSD 13B (NSFW)", "KoboldAI/fairseq-dense-13B-Shinen", "32GB"], - ["Shinen FSD 6.7B (NSFW)", "KoboldAI/fairseq-dense-6.7B-Shinen", "16GB"], - ["Lit 6B (NSFW)", "hakurei/lit-6B", "16GB"], - ["Shinen 6B (NSFW)", "KoboldAI/GPT-J-6B-Shinen", "16GB"], - ["Horni 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Horni", "8GB"], - ["Shinen 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Shinen", "8GB"], - ["Return to Main Menu", "Return", ""], -] - -chatlist= [ - ["Convo 6B (Chatbot)", "hitomi-team/convo-6B", "16GB"], - ["C1 6B (Chatbot)", "hakurei/c1-6B", "16GB"], - ["C1 1.3B (Chatbot)", "iokru/c1-1.3B", "6GB"], - ["Return to Main Menu", "Return", ""], -] -gptneolist = [ - ["GPT-J 6B", "EleutherAI/gpt-j-6B", "16GB"], - ["GPT-Neo 2.7B", "EleutherAI/gpt-neo-2.7B", "8GB"], - ["GPT-Neo 1.3B", "EleutherAI/gpt-neo-1.3B", "6GB"], - ["GPT-Neo 125M", "EleutherAI/gpt-neo-125M", "2GB"], - ["Return to Main Menu", "Return", ""], -] - -gpt2list = [ - ["GPT-2 XL", "gpt2-xl", "6GB"], - ["GPT-2 Large", "gpt2-large", "4GB"], - ["GPT-2 Med", "gpt2-medium", "2GB"], - ["GPT-2", "gpt2", "2GB"], - ["Return to Main Menu", "Return", ""], - ] - -optlist = [ - ["OPT 30B", "facebook/opt-30b", "64GB"], - ["OPT 13B", "facebook/opt-13b", "32GB"], - ["OPT 6.7B", "facebook/opt-6.7b", "16GB"], - ["OPT 2.7B", "facebook/opt-2.7b", "8GB"], - ["OPT 1.3B", "facebook/opt-1.3b", "4GB"], - ["OPT 350M", "facebook/opt-350m", "2GB"], - ["OPT 125M", "facebook/opt-125m", "1GB"], - ["Return to Main Menu", "Return", ""], - ] - -fsdlist = [ - ["Fairseq Dense 13B", "KoboldAI/fairseq-dense-13B", "32GB"], - ["Fairseq Dense 6.7B", "KoboldAI/fairseq-dense-6.7B", "16GB"], - ["Fairseq Dense 2.7B", "KoboldAI/fairseq-dense-2.7B", "8GB"], - ["Fairseq Dense 1.3B", "KoboldAI/fairseq-dense-1.3B", "4GB"], - ["Fairseq Dense 355M", "KoboldAI/fairseq-dense-355M", "2GB"], - ["Fairseq Dense 125M", "KoboldAI/fairseq-dense-125M", "1GB"], - ["Return to Main Menu", "Return", ""], - ] - -xglmlist = [ - ["XGLM 4.5B (Larger Dataset)", "facebook/xglm-4.5B", "12GB"], - ["XGLM 7.5B", "facebook/xglm-7.5B", "18GB"], - ["XGLM 2.9B", "facebook/xglm-2.9B", "10GB"], - ["XGLM 1.7B", "facebook/xglm-1.7B", "6GB"], - ["XGLM 564M", "facebook/xglm-564M", "4GB"], - ["Return to Main Menu", "Return", ""], - ] - -apilist = [ - ["GooseAI API (requires API key)", "GooseAI", ""], - ["OpenAI API (requires API key)", "OAI", ""], - ["InferKit API (requires API key)", "InferKit", ""], - ["KoboldAI Server API (Old Google Colab)", "Colab", ""], - ["Return to Main Menu", "Return", ""], -] + } # Variables class vars: lastact = "" # The last action received from the user @@ -256,7 +264,8 @@ class vars: last_userscripts = [] # List of previous userscript filenames from the previous time userscripts were send via usstatitems 
corescript = "default.lua" # Filename of corescript to load # badwords = [] # Array of str/chr values that should be removed from output - badwordsids = [[13460], [6880], [50256], [42496], [4613], [17414], [22039], [16410], [27], [29], [38430], [37922], [15913], [24618], [28725], [58], [47175], [36937], [26700], [12878], [16471], [37981], [5218], [29795], [13412], [45160], [3693], [49778], [4211], [20598], [36475], [33409], [44167], [32406], [29847], [29342], [42669], [685], [25787], [7359], [3784], [5320], [33994], [33490], [34516], [43734], [17635], [24293], [9959], [23785], [21737], [28401], [18161], [26358], [32509], [1279], [38155], [18189], [26894], [6927], [14610], [23834], [11037], [14631], [26933], [46904], [22330], [25915], [47934], [38214], [1875], [14692], [41832], [13163], [25970], [29565], [44926], [19841], [37250], [49029], [9609], [44438], [16791], [17816], [30109], [41888], [47527], [42924], [23984], [49074], [33717], [31161], [49082], [30138], [31175], [12240], [14804], [7131], [26076], [33250], [3556], [38381], [36338], [32756], [46581], [17912], [49146]] # Tokenized array of badwords used to prevent AI artifacting + badwordsids = [] + badwordsids_default = [[13460], [6880], [50256], [42496], [4613], [17414], [22039], [16410], [27], [29], [38430], [37922], [15913], [24618], [28725], [58], [47175], [36937], [26700], [12878], [16471], [37981], [5218], [29795], [13412], [45160], [3693], [49778], [4211], [20598], [36475], [33409], [44167], [32406], [29847], [29342], [42669], [685], [25787], [7359], [3784], [5320], [33994], [33490], [34516], [43734], [17635], [24293], [9959], [23785], [21737], [28401], [18161], [26358], [32509], [1279], [38155], [18189], [26894], [6927], [14610], [23834], [11037], [14631], [26933], [46904], [22330], [25915], [47934], [38214], [1875], [14692], [41832], [13163], [25970], [29565], [44926], [19841], [37250], [49029], [9609], [44438], [16791], [17816], [30109], [41888], [47527], [42924], [23984], [49074], [33717], [31161], [49082], [30138], [31175], [12240], [14804], [7131], [26076], [33250], [3556], [38381], [36338], [32756], [46581], [17912], [49146]] # Tokenized array of badwords used to prevent AI artifacting badwordsids_neox = [[0], [1], [44162], [9502], [12520], [31841], [36320], [49824], [34417], [6038], [34494], [24815], [26635], [24345], [3455], [28905], [44270], [17278], [32666], [46880], [7086], [43189], [37322], [17778], [20879], [49821], [3138], [14490], [4681], [21391], [26786], [43134], [9336], [683], [48074], [41256], [19181], [29650], [28532], [36487], [45114], [46275], [16445], [15104], [11337], [1168], [5647], [29], [27482], [44965], [43782], [31011], [42944], [47389], [6334], [17548], [38329], [32044], [35487], [2239], [34761], [7444], [1084], [12399], [18990], [17636], [39083], [1184], [35830], [28365], [16731], [43467], [47744], [1138], [16079], [40116], [45564], [18297], [42368], [5456], [18022], [42696], [34476], [23505], [23741], [39334], [37944], [45382], [38709], [33440], [26077], [43600], [34418], [36033], [6660], [48167], [48471], [15775], [19884], [41533], [1008], [31053], [36692], [46576], [20095], [20629], [31759], [46410], [41000], [13488], [30952], [39258], [16160], [27655], [22367], [42767], [43736], [49694], [13811], [12004], [46768], [6257], [37471], [5264], [44153], [33805], [20977], [21083], [25416], [14277], [31096], [42041], [18331], [33376], [22372], [46294], [28379], [38475], [1656], [5204], [27075], [50001], [16616], [11396], [7748], [48744], [35402], [28120], [41512], [4207], [43144], [14767], [15640], 
[16595], [41305], [44479], [38958], [18474], [22734], [30522], [46267], [60], [13976], [31830], [48701], [39822], [9014], [21966], [31422], [28052], [34607], [2479], [3851], [32214], [44082], [45507], [3001], [34368], [34758], [13380], [38363], [4299], [46802], [30996], [12630], [49236], [7082], [8795], [5218], [44740], [9686], [9983], [45301], [27114], [40125], [1570], [26997], [544], [5290], [49193], [23781], [14193], [40000], [2947], [43781], [9102], [48064], [42274], [18772], [49384], [9884], [45635], [43521], [31258], [32056], [47686], [21760], [13143], [10148], [26119], [44308], [31379], [36399], [23983], [46694], [36134], [8562], [12977], [35117], [28591], [49021], [47093], [28653], [29013], [46468], [8605], [7254], [25896], [5032], [8168], [36893], [38270], [20499], [27501], [34419], [29547], [28571], [36586], [20871], [30537], [26842], [21375], [31148], [27618], [33094], [3291], [31789], [28391], [870], [9793], [41361], [47916], [27468], [43856], [8850], [35237], [15707], [47552], [2730], [41449], [45488], [3073], [49806], [21938], [24430], [22747], [20924], [46145], [20481], [20197], [8239], [28231], [17987], [42804], [47269], [29972], [49884], [21382], [46295], [36676], [34616], [3921], [26991], [27720], [46265], [654], [9855], [40354], [5291], [34904], [44342], [2470], [14598], [880], [19282], [2498], [24237], [21431], [16369], [8994], [44524], [45662], [13663], [37077], [1447], [37786], [30863], [42854], [1019], [20322], [4398], [12159], [44072], [48664], [31547], [18736], [9259], [31], [16354], [21810], [4357], [37982], [5064], [2033], [32871], [47446], [62], [22158], [37387], [8743], [47007], [17981], [11049], [4622], [37916], [36786], [35138], [29925], [14157], [18095], [27829], [1181], [22226], [5709], [4725], [30189], [37014], [1254], [11380], [42989], [696], [24576], [39487], [30119], [1092], [8088], [2194], [9899], [14412], [21828], [3725], [13544], [5180], [44679], [34398], [3891], [28739], [14219], [37594], [49550], [11326], [6904], [17266], [5749], [10174], [23405], [9955], [38271], [41018], [13011], [48392], [36784], [24254], [21687], [23734], [5413], [41447], [45472], [10122], [17555], [15830], [47384], [12084], [31350], [47940], [11661], [27988], [45443], [905], [49651], [16614], [34993], [6781], [30803], [35869], [8001], [41604], [28118], [46462], [46762], [16262], [17281], [5774], [10943], [5013], [18257], [6750], [4713], [3951], [11899], [38791], [16943], [37596], [9318], [18413], [40473], [13208], [16375]] badwordsids_opt = [[44717], [46613], [48513], [49923], [50185], [48755], [8488], [43303], [49659], [48601], [49817], [45405], [48742], [49925], [47720], [11227], [48937], [48784], [50017], [42248], [49310], [48082], [49895], [50025], [49092], [49007], [8061], [44226], [0], [742], [28578], [15698], [49784], [46679], [39365], [49281], [49609], [48081], [48906], [46161], [48554], [49670], [48677], [49721], [49632], [48610], [48462], [47457], [10975], [46077], [28696], [48709], [43839], [49798], [49154], [48203], [49625], [48395], [50155], [47161], [49095], [48833], [49420], [49666], [48443], [22176], [49242], [48651], [49138], [49750], [40389], [48021], [21838], [49070], [45333], [40862], [1], [49915], [33525], [49858], [50254], [44403], [48992], [48872], [46117], [49853], [47567], [50206], [41552], [50068], [48999], [49703], [49940], [49329], [47620], [49868], [49962], [2], [44082], [50236], [31274], [50260], [47052], [42645], [49177], [17523], [48691], [49900], [49069], [49358], [48794], [47529], [46479], [48457], [646], [49910], [48077], [48935], [46386], 
[48902], [49151], [48759], [49803], [45587], [48392], [47789], [48654], [49836], [49230], [48188], [50264], [46844], [44690], [48505], [50161], [27779], [49995], [41833], [50154], [49097], [48520], [50018], [8174], [50084], [49366], [49526], [50193], [7479], [49982], [3]] fp32_model = False # Whether or not the most recently loaded HF model was in fp32 format @@ -272,7 +281,7 @@ class vars: colaburl = "" # Ngrok url for Google Colab mode apikey = "" # API key to use for InferKit API calls oaiapikey = "" # API key to use for OpenAI API calls - savedir = getcwd()+"\stories" + savedir = getcwd()+"\\stories" hascuda = False # Whether torch has detected CUDA on the system usegpu = False # Whether to launch pipeline with GPU support custmodpth = "" # Filesystem location of custom model to run @@ -322,12 +331,79 @@ class vars: debug = False # If set to true, will send debug information to the client for display lazy_load = True # Whether or not to use torch_lazy_loader.py for transformers models in order to reduce CPU memory usage use_colab_tpu = os.environ.get("COLAB_TPU_ADDR", "") != "" or os.environ.get("TPU_NAME", "") != "" # Whether or not we're in a Colab TPU instance or Kaggle TPU instance and are going to use the TPU rather than the CPU + revision = None utils.vars = vars +class Send_to_socketio(object): + def write(self, bar): + print(bar, end="") + time.sleep(0.01) + try: + emit('from_server', {'cmd': 'model_load_status', 'data': bar.replace(" ", " ")}, broadcast=True) + except: + pass + +# Set logging level to reduce chatter from Flask +import logging +log = logging.getLogger('werkzeug') +log.setLevel(logging.ERROR) + +# Start flask & SocketIO +print("{0}Initializing Flask... {1}".format(colors.PURPLE, colors.END), end="") +from flask import Flask, render_template, Response, request, copy_current_request_context, send_from_directory +from flask_socketio import SocketIO, emit +app = Flask(__name__, root_path=os.getcwd()) +app.config['SECRET KEY'] = 'secret!' 
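# --- illustrative sketch, not part of the patch -------------------------------
# Send_to_socketio (defined above) is a minimal file-like object: its write()
# echoes the text it receives to the console and relays it to connected
# clients as a 'model_load_status' message.  An illustrative usage, assuming
# it is handed to a consumer that only needs a write() method, such as tqdm's
# file= argument (this particular call is not something the patch does itself):
#
#     from tqdm.auto import tqdm
#     for _ in tqdm(range(100), desc="Loading model", file=Send_to_socketio()):
#         pass  # each progress update is mirrored to the browser via socketio
# -------------------------------------------------------------------------------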
+app.config['TEMPLATES_AUTO_RELOAD'] = True +socketio = SocketIO(app, async_method="eventlet") +print("{0}OK!{1}".format(colors.GREEN, colors.END)) + #==================================================================# # Function to get model selection at startup #==================================================================# +def sendModelSelection(menu="mainmenu", folder="./models"): + #If we send one of the manual load options, send back the list of model directories, otherwise send the menu + if menu in ('NeoCustom', 'GPT2Custom'): + (paths, breadcrumbs) = get_folder_path_info(folder) + if args.remote: + breadcrumbs = [] + menu_list = [[folder, menu, "", False] for folder in paths] + menu_list.append(["Return to Main Menu", "mainmenu", "", True]) + if os.path.abspath("{}/models".format(os.getcwd())) == os.path.abspath(folder): + showdelete=True + else: + showdelete=False + emit('from_server', {'cmd': 'show_model_menu', 'data': menu_list, 'menu': menu, 'breadcrumbs': breadcrumbs, "showdelete": showdelete}, broadcast=True) + else: + emit('from_server', {'cmd': 'show_model_menu', 'data': model_menu[menu], 'menu': menu, 'breadcrumbs': [], "showdelete": False}, broadcast=True) + +def get_folder_path_info(base): + if base == 'This PC': + breadcrumbs = [['This PC', 'This PC']] + paths = [["{}:\\".format(chr(i)), "{}:\\".format(chr(i))] for i in range(65, 91) if os.path.exists("{}:".format(chr(i)))] + else: + path = os.path.abspath(base) + if path[-1] == "\\": + path = path[:-1] + breadcrumbs = [] + for i in range(len(path.replace("/", "\\").split("\\"))): + breadcrumbs.append(["\\".join(path.replace("/", "\\").split("\\")[:i+1]), + path.replace("/", "\\").split("\\")[i]]) + if len(breadcrumbs) == 1: + breadcrumbs = [["{}:\\".format(chr(i)), "{}:\\".format(chr(i))] for i in range(65, 91) if os.path.exists("{}:".format(chr(i)))] + else: + if len([["{}:\\".format(chr(i)), "{}:\\".format(chr(i))] for i in range(65, 91) if os.path.exists("{}:".format(chr(i)))]) > 0: + breadcrumbs.insert(0, ['This PC', 'This PC']) + paths = [] + base_path = os.path.abspath(base) + for item in os.listdir(base_path): + if os.path.isdir(os.path.join(base_path, item)): + paths.append([os.path.join(base_path, item), item]) + # Paths/breadcrumbs is a list of lists, where the first element in the sublist is the full path and the second is the folder name + return (paths, breadcrumbs) + + def getModelSelection(modellist): print(" # Model\t\t\t\t\t\tVRAM\n ========================================================") i = 1 @@ -350,7 +426,7 @@ def getModelSelection(modellist): except Exception as e: if(vars.model == "Return"): getModelSelection(mainmenu) - + # If custom model was selected, get the filesystem location and store it if(vars.model == "NeoCustom" or vars.model == "GPT2Custom"): print("{0}Please choose the folder where pytorch_model.bin is located:{1}\n".format(colors.CYAN, colors.END)) @@ -365,6 +441,17 @@ def getModelSelection(modellist): print("{0}Select an AI model to continue:{1}\n".format(colors.CYAN, colors.END)) getModelSelection(mainmenu) +def check_if_dir_is_model(path): + if os.path.exists(path): + try: + from transformers import AutoConfig + model_config = AutoConfig.from_pretrained(path, revision=vars.revision, cache_dir="cache") + except: + return False + return True + else: + return False + #==================================================================# # Return all keys in tokenizer dictionary containing char #==================================================================# @@ -776,6 
+863,8 @@ def check_for_sp_change(): emit('from_server', {'cmd': 'spstatitems', 'data': {vars.spfilename: vars.spmeta} if vars.allowsp and len(vars.spfilename) else {}}, namespace=None, broadcast=True) vars.sp_changed = False +socketio.start_background_task(check_for_sp_change) + def spRequest(filename): if(not vars.allowsp): raise RuntimeError("Soft prompts are not supported by your current model/backend") @@ -840,182 +929,666 @@ def spRequest(filename): #==================================================================# # Startup #==================================================================# - -# Parsing Parameters -parser = argparse.ArgumentParser(description="KoboldAI Server") -parser.add_argument("--remote", action='store_true', help="Optimizes KoboldAI for Remote Play") -parser.add_argument("--ngrok", action='store_true', help="Optimizes KoboldAI for Remote Play using Ngrok") -parser.add_argument("--localtunnel", action='store_true', help="Optimizes KoboldAI for Remote Play using Localtunnel") -parser.add_argument("--host", action='store_true', help="Optimizes KoboldAI for Remote Play without using a proxy service") -parser.add_argument("--port", type=int, help="Specify the port on which the application will be joinable") -parser.add_argument("--aria2_port", type=int, help="Specify the port on which aria2's RPC interface will be open if aria2 is installed (defaults to 6799)") -parser.add_argument("--model", help="Specify the Model Type to skip the Menu") -parser.add_argument("--path", help="Specify the Path for local models (For model NeoCustom or GPT2Custom)") -parser.add_argument("--revision", help="Specify the model revision for huggingface models (can be a git branch/tag name or a git commit hash)") -parser.add_argument("--cpu", action='store_true', help="By default unattended launches are on the GPU use this option to force CPU usage.") -parser.add_argument("--breakmodel", action='store_true', help=argparse.SUPPRESS) -parser.add_argument("--breakmodel_layers", type=int, help=argparse.SUPPRESS) -parser.add_argument("--breakmodel_gpulayers", type=str, help="If using a model that supports hybrid generation, this is a comma-separated list that specifies how many layers to put on each GPU device. For example to put 8 layers on device 0, 9 layers on device 1 and 11 layers on device 2, use --beakmodel_gpulayers 8,9,11") -parser.add_argument("--override_delete", action='store_true', help="Deleting stories from inside the browser is disabled if you are using --remote and enabled otherwise. Using this option will instead allow deleting stories if using --remote and prevent deleting stories otherwise.") -parser.add_argument("--override_rename", action='store_true', help="Renaming stories from inside the browser is disabled if you are using --remote and enabled otherwise. 
Using this option will instead allow renaming stories if using --remote and prevent renaming stories otherwise.") -parser.add_argument("--configname", help="Force a fixed configuration name to aid with config management.") -parser.add_argument("--colab", action='store_true', help="Optimize for Google Colab.") -parser.add_argument("--nobreakmodel", action='store_true', help="Disables Breakmodel support completely.") -parser.add_argument("--unblock", action='store_true', default=False, help="Unblocks the KoboldAI port to be accessible from other machines without optimizing for remote play (It is recommended to use --host instead)") -parser.add_argument("--quiet", action='store_true', default=False, help="If present will suppress any story related text from showing on the console") -parser.add_argument("--no_aria2", action='store_true', default=False, help="Prevents KoboldAI from using aria2 to download huggingface models more efficiently, in case aria2 is causing you issues") -parser.add_argument("--lowmem", action='store_true', help="Extra Low Memory loading for the GPU, slower but memory does not peak to twice the usage") -parser.add_argument("--savemodel", action='store_true', help="Saves the model to the models folder even if --colab is used (Allows you to save models to Google Drive)") -args: argparse.Namespace = None -if(os.environ.get("KOBOLDAI_ARGS") is not None): - import shlex - args = parser.parse_args(shlex.split(os.environ["KOBOLDAI_ARGS"])) -else: - args = parser.parse_args() - -vars.model = args.model; -vars.revision = args.revision - -if args.colab: - args.remote = True; - args.override_rename = True; - args.override_delete = True; - args.nobreakmodel = True; - args.quiet = True; - args.lowmem = True; - -if args.quiet: - vars.quiet = True - -if args.nobreakmodel: - vars.nobreakmodel = True; - -if args.remote: - vars.host = True; - -if args.ngrok: - vars.host = True; - -if args.localtunnel: - vars.host = True; - -if args.host: - vars.host = True; - -if args.cpu: - vars.use_colab_tpu = False - -vars.smandelete = vars.host == args.override_delete -vars.smanrename = vars.host == args.override_rename - -vars.aria2_port = args.aria2_port or 6799 - -# Select a model to run -if args.model: - print("Welcome to KoboldAI!\nYou have selected the following Model:", vars.model) - if args.path: - print("You have selected the following path for your Model :", args.path) - vars.custmodpth = args.path; - vars.colaburl = args.path + "/request"; # Lets just use the same parameter to keep it simple - -else: - print("{0}Welcome to the KoboldAI Server!\nListed RAM is the optimal VRAM and CPU ram can be up to twice the amount.\nMost models can run at less VRAM with reduced max tokens or less layers on the GPU.\nSelect an AI model to continue:{1}\n".format(colors.CYAN, colors.END)) - getModelSelection(mainmenu) - -# If transformers model was selected & GPU available, ask to use CPU or GPU -if(vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): - vars.allowsp = True - # Test for GPU support - import torch - - # Make model path the same as the model name to make this consistent with the other loading method if it isn't a known model type - # This code is not just a workaround for below, it is also used to make the behavior consistent with other loading methods - Henk717 - if(not vars.model in ["NeoCustom", "GPT2Custom"]): - vars.custmodpth = vars.model - elif(vars.model == "NeoCustom"): - vars.model = 
os.path.basename(os.path.normpath(vars.custmodpth)) - - # Get the model_type from the config or assume a model type if it isn't present - from transformers import AutoConfig - if(os.path.isdir(vars.custmodpth.replace('/', '_'))): - try: - model_config = AutoConfig.from_pretrained(vars.custmodpth.replace('/', '_'), revision=vars.revision, cache_dir="cache") - vars.model_type = model_config.model_type - except ValueError as e: - vars.model_type = "not_found" - elif(os.path.isdir("models/{}".format(vars.custmodpth.replace('/', '_')))): - try: - model_config = AutoConfig.from_pretrained("models/{}".format(vars.custmodpth.replace('/', '_')), revision=vars.revision, cache_dir="cache") - vars.model_type = model_config.model_type - except ValueError as e: - vars.model_type = "not_found" +def general_startup(override_args=None): + global args + # Parsing Parameters + parser = argparse.ArgumentParser(description="KoboldAI Server") + parser.add_argument("--remote", action='store_true', help="Optimizes KoboldAI for Remote Play") + parser.add_argument("--noaimenu", action='store_true', help="Disables the ability to select the AI") + parser.add_argument("--ngrok", action='store_true', help="Optimizes KoboldAI for Remote Play using Ngrok") + parser.add_argument("--localtunnel", action='store_true', help="Optimizes KoboldAI for Remote Play using Localtunnel") + parser.add_argument("--host", action='store_true', help="Optimizes KoboldAI for Remote Play without using a proxy service") + parser.add_argument("--port", type=int, help="Specify the port on which the application will be joinable") + parser.add_argument("--aria2_port", type=int, help="Specify the port on which aria2's RPC interface will be open if aria2 is installed (defaults to 6799)") + parser.add_argument("--model", help="Specify the Model Type to skip the Menu") + parser.add_argument("--path", help="Specify the Path for local models (For model NeoCustom or GPT2Custom)") + parser.add_argument("--revision", help="Specify the model revision for huggingface models (can be a git branch/tag name or a git commit hash)") + parser.add_argument("--cpu", action='store_true', help="By default unattended launches are on the GPU use this option to force CPU usage.") + parser.add_argument("--breakmodel", action='store_true', help=argparse.SUPPRESS) + parser.add_argument("--breakmodel_layers", type=int, help=argparse.SUPPRESS) + parser.add_argument("--breakmodel_gpulayers", type=str, help="If using a model that supports hybrid generation, this is a comma-separated list that specifies how many layers to put on each GPU device. For example to put 8 layers on device 0, 9 layers on device 1 and 11 layers on device 2, use --beakmodel_gpulayers 8,9,11") + parser.add_argument("--override_delete", action='store_true', help="Deleting stories from inside the browser is disabled if you are using --remote and enabled otherwise. Using this option will instead allow deleting stories if using --remote and prevent deleting stories otherwise.") + parser.add_argument("--override_rename", action='store_true', help="Renaming stories from inside the browser is disabled if you are using --remote and enabled otherwise. 
Using this option will instead allow renaming stories if using --remote and prevent renaming stories otherwise.") + parser.add_argument("--configname", help="Force a fixed configuration name to aid with config management.") + parser.add_argument("--colab", action='store_true', help="Optimize for Google Colab.") + parser.add_argument("--nobreakmodel", action='store_true', help="Disables Breakmodel support completely.") + parser.add_argument("--unblock", action='store_true', default=False, help="Unblocks the KoboldAI port to be accessible from other machines without optimizing for remote play (It is recommended to use --host instead)") + parser.add_argument("--quiet", action='store_true', default=False, help="If present will suppress any story related text from showing on the console") + parser.add_argument("--no_aria2", action='store_true', default=False, help="Prevents KoboldAI from using aria2 to download huggingface models more efficiently, in case aria2 is causing you issues") + parser.add_argument("--lowmem", action='store_true', help="Extra Low Memory loading for the GPU, slower but memory does not peak to twice the usage") + parser.add_argument("--savemodel", action='store_true', help="Saves the model to the models folder even if --colab is used (Allows you to save models to Google Drive)") + #args: argparse.Namespace = None + if "pytest" in sys.modules and override_args is None: + args = parser.parse_args([]) + return + if override_args is not None: + import shlex + args = parser.parse_args(shlex.split(override_args)) + elif(os.environ.get("KOBOLDAI_ARGS") is not None): + import shlex + args = parser.parse_args(shlex.split(os.environ["KOBOLDAI_ARGS"])) else: - try: - model_config = AutoConfig.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") - vars.model_type = model_config.model_type - except ValueError as e: - vars.model_type = "not_found" - if(vars.model_type == "not_found" and vars.model == "NeoCustom"): - vars.model_type = "gpt_neo" - elif(vars.model_type == "not_found" and vars.model == "GPT2Custom"): - vars.model_type = "gpt2" - elif(vars.model_type == "not_found"): - print("WARNING: No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)") - vars.model_type = "gpt_neo" + args = parser.parse_args() - if(vars.model_type == "opt"): - vars.badwordsids = vars.badwordsids_opt + vars.model = args.model; + vars.revision = args.revision -if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): - loadmodelsettings() - loadsettings() - print("{0}Looking for GPU support...{1}".format(colors.PURPLE, colors.END), end="") - vars.hascuda = torch.cuda.is_available() - vars.bmsupported = vars.model_type in ("gpt_neo", "gptj", "xglm", "opt") and not vars.nobreakmodel - if(args.breakmodel is not None and args.breakmodel): - print("WARNING: --breakmodel is no longer supported. Breakmodel mode is now automatically enabled when --breakmodel_gpulayers is used (see --help for details).", file=sys.stderr) - if(args.breakmodel_layers is not None): - print("WARNING: --breakmodel_layers is deprecated. 
Use --breakmodel_gpulayers instead (see --help for details).", file=sys.stderr) - if(args.model and vars.bmsupported and not args.breakmodel_gpulayers and not args.breakmodel_layers): - print("WARNING: Model launched without the --breakmodel_gpulayers argument, defaulting to GPU only mode.", file=sys.stderr) - vars.bmsupported = False - if(not vars.bmsupported and (args.breakmodel_gpulayers is not None or args.breakmodel_layers is not None)): - print("WARNING: This model does not support hybrid generation. --breakmodel_gpulayers will be ignored.", file=sys.stderr) - if(vars.hascuda): - print("{0}FOUND!{1}".format(colors.GREEN, colors.END)) - else: - print("{0}NOT FOUND!{1}".format(colors.YELLOW, colors.END)) + if args.colab: + args.remote = True; + args.override_rename = True; + args.override_delete = True; + args.nobreakmodel = True; + args.quiet = True; + args.lowmem = True; + args.noaimenu = True; + + if args.quiet: + vars.quiet = True + + if args.nobreakmodel: + vars.nobreakmodel = True; + + if args.remote: + vars.host = True; + + if args.ngrok: + vars.host = True; + + if args.localtunnel: + vars.host = True; + + if args.host: + vars.host = True; + + if args.cpu: + vars.use_colab_tpu = False + + vars.smandelete = vars.host == args.override_delete + vars.smanrename = vars.host == args.override_rename + + vars.aria2_port = args.aria2_port or 6799 - if args.model: - if(vars.hascuda): - genselected = True - vars.usegpu = True - vars.breakmodel = False - if(vars.bmsupported): - vars.usegpu = False - vars.breakmodel = True - if(args.cpu): - vars.usegpu = False - vars.breakmodel = False - elif(vars.hascuda): - if(vars.bmsupported): - genselected = True - vars.usegpu = False - vars.breakmodel = True + #Now let's look to see if we are going to force a load of a model from a user selected folder + if(vars.model == "selectfolder"): + print("{0}Please choose the folder where pytorch_model.bin is located:{1}\n".format(colors.CYAN, colors.END)) + modpath = fileops.getdirpath(getcwd() + "/models", "Select Model Folder") + + if(modpath): + # Save directory to vars + vars.model = "NeoCustom" + vars.custmodpth = modpath + elif args.model: + print("Welcome to KoboldAI!\nYou have selected the following Model:", vars.model) + if args.path: + print("You have selected the following path for your Model :", args.path) + vars.custmodpth = args.path; + vars.colaburl = args.path + "/request"; # Lets just use the same parameter to keep it simple +#==================================================================# +# Load Model +#==================================================================# + +def tpumtjgetsofttokens(): + soft_tokens = None + if(vars.sp is None): + global np + if 'np' not in globals(): + import numpy as np + tensor = np.zeros((1, tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"])), dtype=np.float32) + rows = tensor.shape[0] + padding_amount = tpu_mtj_backend.params["seq"] - (tpu_mtj_backend.params["seq"] % -tpu_mtj_backend.params["cores_per_replica"]) - rows + tensor = np.pad(tensor, ((0, padding_amount), (0, 0))) + tensor = tensor.reshape( + tpu_mtj_backend.params["cores_per_replica"], + -1, + tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"]), + ) + vars.sp = tpu_mtj_backend.shard_xmap(tensor) + soft_tokens = np.arange( + tpu_mtj_backend.params["n_vocab"] + tpu_mtj_backend.params["n_vocab_padding"], + tpu_mtj_backend.params["n_vocab"] + tpu_mtj_backend.params["n_vocab_padding"] + vars.sp_length, + dtype=np.uint32 + ) + return soft_tokens + +def 
get_model_info(model, directory=""): + # if the model is in the api list + key = False + breakmodel = False + gpu = False + layer_count = None + key_value = "" + break_values = [] + url = False + gpu_count = torch.cuda.device_count() + gpu_names = [] + for i in range(gpu_count): + gpu_names.append(torch.cuda.get_device_name(i)) + if model in [x[1] for x in model_menu['apilist']]: + if path.exists("settings/{}.settings".format(model)): + with open("settings/{}.settings".format(model), "r") as file: + # Check if API key exists + js = json.load(file) + if("apikey" in js and js["apikey"] != ""): + # API key exists, grab it and close the file + key_value = js["apikey"] + elif 'oaiapikey' in js and js['oaiapikey'] != "": + key_value = js["oaiapikey"] + key = True + elif model == 'ReadOnly': + pass + elif model == 'Colab': + url = True + elif not torch.cuda.is_available(): + pass + else: + layer_count = get_layer_count(model, directory=directory) + if layer_count is None: + breakmodel = False else: - print(" 1 - GPU\n 2 - CPU\n") - genselected = False - else: - genselected = False + breakmodel = True + if path.exists("settings/{}.breakmodel".format(model.replace("/", "_"))): + with open("settings/{}.breakmodel".format(model.replace("/", "_")), "r") as file: + break_values = file.read().split(",") + else: + break_values = [layer_count] + break_values += [0] * (gpu_count - len(break_values)) + #print("Model_info: {}".format({'cmd': 'selected_model_info', 'key_value': key_value, 'key':key, + # 'gpu':gpu, 'layer_count':layer_count, 'breakmodel':breakmodel, + # 'break_values': break_values, 'gpu_count': gpu_count, + # 'url': url, 'gpu_names': gpu_names})) + emit('from_server', {'cmd': 'selected_model_info', 'key_value': key_value, 'key':key, + 'gpu':gpu, 'layer_count':layer_count, 'breakmodel':breakmodel, + 'break_values': break_values, 'gpu_count': gpu_count, + 'url': url, 'gpu_names': gpu_names}, broadcast=True) + if key_value != "": + get_oai_models(key_value) + - if(vars.hascuda): - while(genselected == False): - genselect = input("Mode> ") - if(genselect == ""): - vars.breakmodel = False - vars.usegpu = True +def get_layer_count(model, directory=""): + if(model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]): + if(vars.model == "GPT2Custom"): + model_config = open(vars.custmodpth + "/config.json", "r") + # Get the model_type from the config or assume a model type if it isn't present + else: + from transformers import AutoConfig + if directory == "": + model_config = AutoConfig.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache") + elif(os.path.isdir(vars.custmodpth.replace('/', '_'))): + model_config = AutoConfig.from_pretrained(vars.custmodpth.replace('/', '_'), revision=vars.revision, cache_dir="cache") + elif(os.path.isdir(directory)): + model_config = AutoConfig.from_pretrained(directory, revision=vars.revision, cache_dir="cache") + else: + model_config = AutoConfig.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + + + + return utils.num_layers(model_config) + else: + return None + + +def get_oai_models(key): + vars.oaiapikey = key + if vars.model == 'OAI': + url = "https://api.openai.com/v1/engines" + elif vars.model == 'GooseAI': + url = "https://api.goose.ai/v1/engines" + else: + return + + # Get list of models from OAI + print("{0}Retrieving engine list...{1}".format(colors.PURPLE, colors.END), end="") + req = requests.get( + url, + headers = { + 'Authorization': 'Bearer '+key + } + ) + if(req.status_code 
== 200): + engines = req.json()["data"] + try: + engines = [[en["id"], "{} ({})".format(en['id'], "Ready" if en["ready"] == True else "Not Ready")] for en in engines] + except: + print(engines) + raise + + online_model = "" + changed=False + + #Save the key + if not path.exists("settings"): + # If the client settings file doesn't exist, create it + # Write API key to file + os.makedirs('settings', exist_ok=True) + if path.exists("settings/{}.settings".format(vars.model)): + with open("settings/{}.settings".format(vars.model), "r") as file: + js = json.load(file) + if 'online_model' in js: + online_model = js['online_model'] + if "apikey" in js: + if js['apikey'] != key: + changed=True + if changed: + with open("settings/{}.settings".format(vars.model), "w") as file: + js["apikey"] = key + file.write(json.dumps(js, indent=3)) + + emit('from_server', {'cmd': 'oai_engines', 'data': engines, 'online_model': online_model}, broadcast=True) + else: + # Something went wrong, print the message and quit since we can't initialize an engine + print("{0}ERROR!{1}".format(colors.RED, colors.END)) + print(req.json()) + emit('from_server', {'cmd': 'errmsg', 'data': req.json()}) + + +def patch_transformers(): + global transformers + old_from_pretrained = PreTrainedModel.from_pretrained.__func__ + @classmethod + def new_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + vars.fp32_model = False + utils.num_shards = None + utils.current_shard = 0 + utils.from_pretrained_model_name = pretrained_model_name_or_path + utils.from_pretrained_index_filename = None + utils.from_pretrained_kwargs = kwargs + utils.bar = None + if not args.no_aria2: + utils.aria2_hook(pretrained_model_name_or_path, **kwargs) + return old_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs) + PreTrainedModel.from_pretrained = new_from_pretrained + if(hasattr(modeling_utils, "get_checkpoint_shard_files")): + old_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files + def new_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs): + utils.num_shards = utils.get_num_shards(index_filename) + utils.from_pretrained_index_filename = index_filename + return old_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs) + modeling_utils.get_checkpoint_shard_files = new_get_checkpoint_shard_files + + # Some versions of transformers 4.17.0.dev0 are affected by + # https://github.com/huggingface/transformers/issues/15736 + # This is a workaround for those versions of transformers. 
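    # The patched forward below derives the position ids from
    # past_key_values_length and the embedding's padding index, grows the
    # sinusoidal weight table via make_weights() whenever the sequence runs
    # past its current size, and then gathers the embeddings with index_select.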
+ if(transformers_version == "4.17.0.dev0"): + try: + from transformers.models.xglm.modeling_xglm import XGLMSinusoidalPositionalEmbedding + except ImportError: + pass + else: + @torch.no_grad() + def new_forward(self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0): + bsz, seq_len = inputs_embeds.size()[:-1] + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + position_ids = torch.arange( + past_key_values_length + self.padding_idx + 1, past_key_values_length + sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ).unsqueeze(0).expand(input_shape).contiguous() + max_pos = self.padding_idx + 1 + seq_len + past_key_values_length + if max_pos > self.weights.size(0): + self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx) + return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach() + XGLMSinusoidalPositionalEmbedding.forward = new_forward + + # Patch transformers to use our soft prompt + def patch_causallm(cls): + old_forward = cls.forward + def new_causallm_forward(self, *args, **kwargs): + input_ids = kwargs.get('input_ids').to(self.device) + assert input_ids is not None + kwargs['input_ids'] = None + if(vars.sp is not None): + shifted_input_ids = input_ids - self.config.vocab_size + input_ids.clamp_(max=self.config.vocab_size-1) + if(hasattr(self, "transformer")): + inputs_embeds = self.transformer.wte(input_ids) + elif(not hasattr(self.model, "decoder")): + inputs_embeds = self.model.embed_tokens(input_ids) + else: + inputs_embeds = self.model.decoder.embed_tokens(input_ids) + if(vars.sp is not None): + vars.sp = vars.sp.to(inputs_embeds.dtype).to(inputs_embeds.device) + inputs_embeds = torch.where( + (shifted_input_ids >= 0)[..., None], + vars.sp[shifted_input_ids.clamp(min=0)], + inputs_embeds, + ) + if(hasattr(self, "model") and hasattr(self.model, "embed_scale")): + inputs_embeds *= self.model.embed_scale + kwargs['inputs_embeds'] = inputs_embeds + return old_forward(self, *args, **kwargs) + cls.forward = new_causallm_forward + for cls in (GPT2LMHeadModel, GPTNeoForCausalLM): + patch_causallm(cls) + for c in ("GPTJForCausalLM", "XGLMForCausalLM", "OPTForCausalLM"): + try: + patch_causallm(getattr(__import__("transformers"), c)) + except: + pass + + + # Fix a bug in OPTForCausalLM where self.lm_head is the wrong size + if(packaging.version.parse("4.19.0.dev0") <= packaging.version.parse(transformers_version) < packaging.version.parse("4.20.0")): + try: + from transformers import OPTForCausalLM, OPTModel + except ImportError: + pass + else: + # This is the same as the original __init__ but with + # config.hidden_size + # replaced with + # config.word_embed_proj_dim + def new_init(self, config): + super(OPTForCausalLM, self).__init__(config) + self.model = OPTModel(config) + self.lm_head = torch.nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False) + self.post_init() + OPTForCausalLM.__init__ = new_init + + + # Patch transformers to use our custom logit warpers + from transformers import LogitsProcessorList, LogitsWarper, LogitsProcessor, TopKLogitsWarper, TopPLogitsWarper, TemperatureLogitsWarper, RepetitionPenaltyLogitsProcessor + from warpers import AdvancedRepetitionPenaltyLogitsProcessor, TailFreeLogitsWarper, TypicalLogitsWarper, TopALogitsWarper + + def dynamic_processor_wrap(cls, field_name, var_name, cond=None): + old_call = cls.__call__ + def new_call(self, *args, **kwargs): + if(not 
isinstance(field_name, str) and isinstance(field_name, Iterable)): + conds = [] + for f, v in zip(field_name, var_name): + conds.append(getattr(vars, v)) + setattr(self, f, conds[-1]) + else: + conds = getattr(vars, var_name) + setattr(self, field_name, conds) + assert len(args) == 2 + if(cond is None or cond(conds)): + return old_call(self, *args, **kwargs) + return args[1] + cls.__call__ = new_call + dynamic_processor_wrap(AdvancedRepetitionPenaltyLogitsProcessor, ("penalty", "penalty_slope", "penalty_range"), ("rep_pen", "rep_pen_slope", "rep_pen_range"), cond=lambda x: x[0] != 1.0) + dynamic_processor_wrap(TopKLogitsWarper, "top_k", "top_k", cond=lambda x: x > 0) + dynamic_processor_wrap(TopALogitsWarper, "top_a", "top_a", cond=lambda x: x > 0.0) + dynamic_processor_wrap(TopPLogitsWarper, "top_p", "top_p", cond=lambda x: x < 1.0) + dynamic_processor_wrap(TailFreeLogitsWarper, "tfs", "tfs", cond=lambda x: x < 1.0) + dynamic_processor_wrap(TypicalLogitsWarper, "typical", "typical", cond=lambda x: x < 1.0) + dynamic_processor_wrap(TemperatureLogitsWarper, "temperature", "temp", cond=lambda x: x != 1.0) + RepetitionPenaltyLogitsProcessor.__init__ = AdvancedRepetitionPenaltyLogitsProcessor.__init__ + RepetitionPenaltyLogitsProcessor.__call__ = AdvancedRepetitionPenaltyLogitsProcessor.__call__ + + class LuaLogitsProcessor(LogitsProcessor): + + def __init__(self): + pass + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + assert scores.ndim == 2 + assert input_ids.ndim == 2 + self.regeneration_required = False + self.halt = False + + scores_shape = scores.shape + scores_list = scores.tolist() + vars.lua_koboldbridge.logits = vars.lua_state.table() + for r, row in enumerate(scores_list): + vars.lua_koboldbridge.logits[r+1] = vars.lua_state.table(*row) + vars.lua_koboldbridge.vocab_size = scores_shape[-1] + + execute_genmod() + + scores = torch.tensor( + tuple(tuple(row.values()) for row in vars.lua_koboldbridge.logits.values()), + device=scores.device, + dtype=scores.dtype, + ) + assert scores.shape == scores_shape + + return scores + + def new_get_logits_processor(*args, **kwargs) -> LogitsProcessorList: + processors = new_get_logits_processor.old_get_logits_processor(*args, **kwargs) + processors.insert(0, LuaLogitsProcessor()) + return processors + new_get_logits_processor.old_get_logits_processor = transformers.generation_utils.GenerationMixin._get_logits_processor + transformers.generation_utils.GenerationMixin._get_logits_processor = new_get_logits_processor + + class KoboldLogitsWarperList(LogitsProcessorList): + def __init__(self, beams: int = 1, **kwargs): + self.__warper_list: List[LogitsWarper] = [] + self.__warper_list.append(TopKLogitsWarper(top_k=1, min_tokens_to_keep=1 + (beams > 1))) + self.__warper_list.append(TopALogitsWarper(top_a=0.5, min_tokens_to_keep=1 + (beams > 1))) + self.__warper_list.append(TopPLogitsWarper(top_p=0.5, min_tokens_to_keep=1 + (beams > 1))) + self.__warper_list.append(TailFreeLogitsWarper(tfs=0.5, min_tokens_to_keep=1 + (beams > 1))) + self.__warper_list.append(TypicalLogitsWarper(typical=0.5, min_tokens_to_keep=1 + (beams > 1))) + self.__warper_list.append(TemperatureLogitsWarper(temperature=0.5)) + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, *args, **kwargs): + for k in vars.sampler_order: + scores = self.__warper_list[k](input_ids, scores, *args, **kwargs) + return scores + + def new_get_logits_warper(beams: int = 1,) -> LogitsProcessorList: + return 
KoboldLogitsWarperList(beams=beams) + + def new_sample(self, *args, **kwargs): + assert kwargs.pop("logits_warper", None) is not None + kwargs["logits_warper"] = new_get_logits_warper( + beams=1, + ) + if(vars.newlinemode == "s") or (vars.newlinemode == "ns"): + kwargs["eos_token_id"] = -1 + kwargs.setdefault("pad_token_id", 2) + return new_sample.old_sample(self, *args, **kwargs) + new_sample.old_sample = transformers.generation_utils.GenerationMixin.sample + transformers.generation_utils.GenerationMixin.sample = new_sample + + + # Allow bad words filter to ban <|endoftext|> token + import transformers.generation_logits_process + def new_init(self, bad_words_ids: List[List[int]], eos_token_id: int): + return new_init.old_init(self, bad_words_ids, -1) + new_init.old_init = transformers.generation_logits_process.NoBadWordsLogitsProcessor.__init__ + transformers.generation_logits_process.NoBadWordsLogitsProcessor.__init__ = new_init + + + # Sets up dynamic world info scanner + class DynamicWorldInfoScanCriteria(StoppingCriteria): + def __init__( + self, + tokenizer, + excluded_world_info: List[Set], + ): + self.regeneration_required = False + self.halt = False + self.tokenizer = tokenizer + self.excluded_world_info = excluded_world_info + def __call__( + self, + input_ids: torch.LongTensor, + scores: torch.FloatTensor, + **kwargs, + ) -> bool: + vars.generated_tkns += 1 + if(vars.lua_koboldbridge.generated_cols and vars.generated_tkns != vars.lua_koboldbridge.generated_cols): + raise RuntimeError(f"Inconsistency detected between KoboldAI Python and Lua backends ({vars.generated_tkns} != {vars.lua_koboldbridge.generated_cols})") + if(vars.abort or vars.generated_tkns >= vars.genamt): + self.regeneration_required = False + self.halt = False + return True + + assert input_ids.ndim == 2 + assert len(self.excluded_world_info) == input_ids.shape[0] + self.regeneration_required = vars.lua_koboldbridge.regeneration_required + self.halt = not vars.lua_koboldbridge.generating + vars.lua_koboldbridge.regeneration_required = False + + for i in range(vars.numseqs): + vars.lua_koboldbridge.generated[i+1][vars.generated_tkns] = int(input_ids[i, -1].item()) + + if(not vars.dynamicscan): + return self.regeneration_required or self.halt + tail = input_ids[..., -vars.generated_tkns:] + for i, t in enumerate(tail): + decoded = utils.decodenewlines(tokenizer.decode(t)) + _, found = checkworldinfo(decoded, force_use_txt=True, actions=vars._actions) + found -= self.excluded_world_info[i] + if(len(found) != 0): + self.regeneration_required = True + break + return self.regeneration_required or self.halt + old_get_stopping_criteria = transformers.generation_utils.GenerationMixin._get_stopping_criteria + def new_get_stopping_criteria(self, *args, **kwargs): + stopping_criteria = old_get_stopping_criteria(self, *args, **kwargs) + global tokenizer + self.kai_scanner = DynamicWorldInfoScanCriteria( + tokenizer=tokenizer, + excluded_world_info=self.kai_scanner_excluded_world_info, + ) + stopping_criteria.insert(0, self.kai_scanner) + return stopping_criteria + transformers.generation_utils.GenerationMixin._get_stopping_criteria = new_get_stopping_criteria + +def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model=""): + global model + global generator + global torch + global model_config + global GPT2TokenizerFast + global tokenizer + print("Loading vars.model: {} vars.custmodpth: {}".format(vars.model, vars.custmodpth)) + vars.noai = False + if not initial_load: + set_aibusy(True) + if vars.model 
!= 'ReadOnly': + emit('from_server', {'cmd': 'model_load_status', 'data': "Loading {}".format(vars.model)}, broadcast=True) + #Have to add a sleep so the server will send the emit for some reason + time.sleep(0.1) + if gpu_layers is not None: + args.breakmodel_gpulayers = gpu_layers + + #We need to wipe out the existing model and refresh the cuda cache + model = None + generator = None + model_config = None + try: + torch.cuda.empty_cache() + except: + pass + + #Reload our badwords + vars.badwordsids = vars.badwordsids_default + + #Let's set the GooseAI or OpenAI server URLs if that's applicable + if online_model != "": + if path.exists("settings/{}.settings".format(vars.model)): + changed=False + with open("settings/{}.settings".format(vars.model), "r") as file: + # Check if API key exists + js = json.load(file) + if 'online_model' in js: + if js['online_model'] != online_model: + changed=True + js['online_model'] = online_model + else: + changed=True + js['online_model'] = online_model + if changed: + with open("settings/{}.settings".format(vars.model), "w") as file: + file.write(json.dumps(js, indent=3)) + # Swap OAI Server if GooseAI was selected + if(vars.model == "GooseAI"): + vars.oaiengines = "https://api.goose.ai/v1/engines" + vars.model = "OAI" + args.configname = "GooseAI" + "/" + online_model + else: + args.configname = vars.model + "/" + online_model + vars.oaiurl = vars.oaiengines + "/{0}/completions".format(online_model) + + + # If transformers model was selected & GPU available, ask to use CPU or GPU + if(vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): + vars.allowsp = True + # Test for GPU support + + # Make model path the same as the model name to make this consistent with the other loading method if it isn't a known model type + # This code is not just a workaround for below, it is also used to make the behavior consistent with other loading methods - Henk717 + if(not vars.model in ["NeoCustom", "GPT2Custom"]): + vars.custmodpth = vars.model + elif(vars.model == "NeoCustom"): + vars.model = os.path.basename(os.path.normpath(vars.custmodpth)) + + # Get the model_type from the config or assume a model type if it isn't present + from transformers import AutoConfig + if(os.path.isdir(vars.custmodpth.replace('/', '_'))): + try: + model_config = AutoConfig.from_pretrained(vars.custmodpth.replace('/', '_'), revision=vars.revision, cache_dir="cache") + vars.model_type = model_config.model_type + except ValueError as e: + vars.model_type = "not_found" + elif(os.path.isdir("models/{}".format(vars.custmodpth.replace('/', '_')))): + try: + model_config = AutoConfig.from_pretrained("models/{}".format(vars.custmodpth.replace('/', '_')), revision=vars.revision, cache_dir="cache") + vars.model_type = model_config.model_type + except ValueError as e: + vars.model_type = "not_found" + else: + try: + model_config = AutoConfig.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + vars.model_type = model_config.model_type + except ValueError as e: + vars.model_type = "not_found" + if(vars.model_type == "not_found" and vars.model == "NeoCustom"): + vars.model_type = "gpt_neo" + elif(vars.model_type == "not_found" and vars.model == "GPT2Custom"): + vars.model_type = "gpt2" + elif(vars.model_type == "not_found"): + print("WARNING: No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)") + vars.model_type = "gpt_neo" + + 
if(vars.model_type == "opt"): + vars.badwordsids = vars.badwordsids_opt + + if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): + loadmodelsettings() + loadsettings() + print("{0}Looking for GPU support...{1}".format(colors.PURPLE, colors.END), end="") + vars.hascuda = torch.cuda.is_available() + vars.bmsupported = vars.model_type in ("gpt_neo", "gptj", "xglm", "opt") and not vars.nobreakmodel + if(args.breakmodel is not None and args.breakmodel): + print("WARNING: --breakmodel is no longer supported. Breakmodel mode is now automatically enabled when --breakmodel_gpulayers is used (see --help for details).", file=sys.stderr) + if(args.breakmodel_layers is not None): + print("WARNING: --breakmodel_layers is deprecated. Use --breakmodel_gpulayers instead (see --help for details).", file=sys.stderr) + if(args.model and vars.bmsupported and not args.breakmodel_gpulayers and not args.breakmodel_layers): + print("WARNING: Model launched without the --breakmodel_gpulayers argument, defaulting to GPU only mode.", file=sys.stderr) + vars.bmsupported = False + if(not vars.bmsupported and (args.breakmodel_gpulayers is not None or args.breakmodel_layers is not None)): + print("WARNING: This model does not support hybrid generation. --breakmodel_gpulayers will be ignored.", file=sys.stderr) + if(vars.hascuda): + print("{0}FOUND!{1}".format(colors.GREEN, colors.END)) + else: + print("{0}NOT FOUND!{1}".format(colors.YELLOW, colors.END)) + + if args.model: + if(vars.hascuda): genselected = True - elif(genselect.isnumeric() and int(genselect) == 1): + vars.usegpu = True + vars.breakmodel = False + if(vars.bmsupported): + vars.usegpu = False + vars.breakmodel = True + if(args.cpu): + vars.usegpu = False + vars.breakmodel = False + elif(vars.hascuda): + if(vars.bmsupported): + genselected = True + vars.usegpu = False + vars.breakmodel = True + else: + genselected = False + else: + genselected = False + + if(vars.hascuda): + if(use_gpu): if(vars.bmsupported): vars.breakmodel = True vars.usegpu = False @@ -1024,172 +1597,317 @@ if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "Go vars.breakmodel = False vars.usegpu = True genselected = True - elif(genselect.isnumeric() and int(genselect) == 2): + else: vars.breakmodel = False vars.usegpu = False genselected = True - else: - print("{0}Please enter a valid selection.{1}".format(colors.RED, colors.END)) -# Ask for API key if InferKit was selected -if(vars.model == "InferKit"): - if(not path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): - # If the client settings file doesn't exist, create it - print("{0}Please enter your InferKit API key:{1}\n".format(colors.CYAN, colors.END)) - vars.apikey = input("Key> ") - # Write API key to file - os.makedirs('settings', exist_ok=True) - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "w") - try: - js = {"apikey": vars.apikey} - file.write(json.dumps(js, indent=3)) - finally: - file.close() - else: - # Otherwise open it up - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") - # Check if API key exists - js = json.load(file) - if("apikey" in js and js["apikey"] != ""): - # API key exists, grab it and close the file - vars.apikey = js["apikey"] - file.close() - else: - # Get API key, add it to settings object, and write it to disk - print("{0}Please enter your InferKit API key:{1}\n".format(colors.CYAN, colors.END)) - 
vars.apikey = input("Key> ") - js["apikey"] = vars.apikey - # Write API key to file - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "w") - try: - file.write(json.dumps(js, indent=3)) - finally: - file.close() - -# Swap OAI Server if GooseAI was selected -if(vars.model == "GooseAI"): - vars.oaiengines = "https://api.goose.ai/v1/engines" - vars.model = "OAI" - args.configname = "GooseAI" + # Ask for API key if InferKit was selected + if(vars.model == "InferKit"): + vars.apikey = vars.oaiapikey + + # Swap OAI Server if GooseAI was selected + if(vars.model == "GooseAI"): + vars.oaiengines = "https://api.goose.ai/v1/engines" + vars.model = "OAI" + args.configname = "GooseAI" -# Ask for API key if OpenAI was selected -if(vars.model == "OAI"): - if not args.configname: - args.configname = "OAI" - if(not path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): - # If the client settings file doesn't exist, create it - print("{0}Please enter your API key:{1}\n".format(colors.CYAN, colors.END)) - vars.oaiapikey = input("Key> ") - # Write API key to file - os.makedirs('settings', exist_ok=True) - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "w") - try: - js = {"oaiapikey": vars.oaiapikey} - file.write(json.dumps(js, indent=3)) - finally: - file.close() - else: - # Otherwise open it up - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") - # Check if API key exists - js = json.load(file) - if("oaiapikey" in js and js["oaiapikey"] != ""): - # API key exists, grab it and close the file - vars.oaiapikey = js["oaiapikey"] - file.close() - else: - # Get API key, add it to settings object, and write it to disk - print("{0}Please enter your API key:{1}\n".format(colors.CYAN, colors.END)) - vars.oaiapikey = input("Key> ") - js["oaiapikey"] = vars.oaiapikey - # Write API key to file - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "w") - try: - file.write(json.dumps(js, indent=3)) - finally: - file.close() - - if vars.custmodpth: - vars.oaiurl = vars.oaiengines + "/" + vars.custmodpth + "/completions" - args.configname = args.configname + "/" + vars.custmodpth - engselected = True - else: - # Get list of models from OAI - print("{0}Retrieving engine list...{1}".format(colors.PURPLE, colors.END), end="") - req = requests.get( - vars.oaiengines, - headers = { - 'Authorization': 'Bearer '+vars.oaiapikey - } - ) - if(req.status_code == 200): - print("{0}OK!{1}".format(colors.GREEN, colors.END)) - print("{0}Please select an engine to use:{1}\n".format(colors.CYAN, colors.END)) - engines = req.json()["data"] - # Print list of engines - i = 0 - for en in engines: - print(" {0} - {1} ({2})".format(i, en["id"], "\033[92mready\033[0m" if en["ready"] == True else "\033[91mnot ready\033[0m")) - i += 1 - # Get engine to use - print("") - engselected = False - while(engselected == False): - engine = input("Engine #> ") - if(engine.isnumeric() and int(engine) < len(engines)): - vars.oaiurl = vars.oaiengines + "/{0}/completions".format(engines[int(engine)]["id"]) - args.configname = args.configname + "/" + engines[int(engine)]["id"] - engselected = True + # Ask for API key if OpenAI was selected + if(vars.model == "OAI"): + if not args.configname: + args.configname = "OAI" + + if(vars.model == "ReadOnly"): + vars.noai = True + + # Start transformers and create pipeline + if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", 
"TPUMeshTransformerGPTNeoX"]): + if(not vars.noai): + print("{0}Initializing transformers, please wait...{1}".format(colors.PURPLE, colors.END)) + for m in ("GPTJModel", "XGLMModel"): + try: + globals()[m] = getattr(__import__("transformers"), m) + except: + pass + + # Lazy loader + import torch_lazy_loader + def get_lazy_load_callback(n_layers, convert_to_float16=True): + if not vars.lazy_load: + return + + from tqdm.auto import tqdm + + if "breakmodel" in globals(): + gpu_blocks = breakmodel.gpu_blocks + ram_blocks = ram_blocks = n_layers - sum(gpu_blocks) + cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) else: - print("{0}Please enter a valid selection.{1}".format(colors.RED, colors.END)) + ram_blocks = gpu_blocks = cumulative_gpu_blocks = None + + def lazy_load_callback(model_dict, f, **_): + if lazy_load_callback.nested: + return + lazy_load_callback.nested = True + + device_map = {} + + for _key, spec in lazy_load_spec.get("layer_weights", {}).items(): + for layer in range(n_layers): + key = _key.format(layer=layer) + if key not in model_dict: + continue + device = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" if not vars.hascuda or not vars.breakmodel or layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) + device_map[key] = device + + for key, value in model_dict.items(): + if isinstance(value, torch_lazy_loader.LazyTensor) and key not in device_map: + device_map[key] = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" + + if utils.num_shards is None or utils.current_shard == 0: + if utils.num_shards is not None: + num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs)) + else: + num_tensors = len(device_map) + print(flush=True) + utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=Send_to_socketio()) + + with zipfile.ZipFile(f, "r") as z: + try: + last_storage_key = None + f = None + current_offset = 0 + if utils.num_shards is not None: + utils.current_shard += 1 + for key in sorted(device_map.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)): + storage_key = model_dict[key].key + if storage_key != last_storage_key or model_dict[key].seek_offset < current_offset: + last_storage_key = storage_key + if isinstance(f, zipfile.ZipExtFile): + f.close() + f = z.open(f"archive/data/{storage_key}") + current_offset = 0 + if current_offset != model_dict[key].seek_offset: + f.read(model_dict[key].seek_offset - current_offset) + current_offset = model_dict[key].seek_offset + device = device_map[key] + size = functools.reduce(lambda x, y: x * y, model_dict[key].shape, 1) + dtype = model_dict[key].dtype + nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) + #print(f"Transferring <{key}> to {'(CPU)' if device == 'cpu' else '[device ' + str(device) + ']'} ... 
", end="", flush=True) + model_dict[key] = model_dict[key].materialize(f, map_location="cpu") + if model_dict[key].dtype is torch.float32: + vars.fp32_model = True + if convert_to_float16 and vars.hascuda and (vars.breakmodel or vars.usegpu) and model_dict[key].dtype is torch.float32: + model_dict[key] = model_dict[key].to(torch.float16) + if not vars.usegpu and not vars.breakmodel and model_dict[key].dtype is torch.float16: + model_dict[key] = model_dict[key].to(torch.float32) + model_dict[key] = model_dict[key].to(device) + #print("OK", flush=True) + current_offset += nbytes + utils.bar.update(1) + finally: + if utils.num_shards is None or utils.current_shard >= utils.num_shards: + utils.bar.close() + utils.bar = None + lazy_load_callback.nested = False + if isinstance(f, zipfile.ZipExtFile): + f.close() + + lazy_load_callback.nested = False + return lazy_load_callback + + lazy_load_config_path = os.path.join("maps", vars.model_type + ".json") + if(vars.lazy_load and "model_config" in globals() and os.path.isfile(lazy_load_config_path)): + with open(lazy_load_config_path) as f: + lazy_load_spec = json.load(f) + + else: + vars.lazy_load = False + + + + def get_hidden_size_from_model(model): + try: + return int(model.model.decoder.project_in.in_features) + except: + try: + return int(model.model.decoder.embed_tokens.out_features) + except: + try: + return int(model.transformer.hidden_size) + except: + try: + return int(model.transformer.embed_dim) + except: + return int(model.lm_head.in_features) + + def maybe_low_cpu_mem_usage() -> Dict[str, Any]: + if(packaging.version.parse(transformers_version) < packaging.version.parse("4.11.0")): + print(f"\nWARNING: Please upgrade to transformers 4.11.0 for lower RAM usage. You have transformers {transformers_version}.", file=sys.stderr) + return {} + return {"low_cpu_mem_usage": True} + + @contextlib.contextmanager + def maybe_use_float16(always_use=False): + if(always_use or (vars.hascuda and args.lowmem and (vars.usegpu or vars.breakmodel))): + original_dtype = torch.get_default_dtype() + torch.set_default_dtype(torch.float16) + yield True + torch.set_default_dtype(original_dtype) + else: + yield False + + # If custom GPT2 model was chosen + if(vars.model == "GPT2Custom"): + vars.lazy_load = False + model_config = open(vars.custmodpth + "/config.json", "r") + js = json.load(model_config) + with(maybe_use_float16()): + model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + vars.modeldim = get_hidden_size_from_model(model) + # Is CUDA available? 
If so, use GPU, otherwise fall back to CPU + if(vars.hascuda and vars.usegpu): + model = model.half().to(vars.gpu_device) + generator = model.generate + else: + model = model.to('cpu').float() + generator = model.generate + # Use the Generic implementation + else: + lowmem = maybe_low_cpu_mem_usage() + # We must disable low_cpu_mem_usage (by setting lowmem to {}) if + # using a GPT-2 model because GPT-2 is not compatible with this + # feature yet + if(vars.model_type == "gpt2"): + lowmem = {} + + # If we're using torch_lazy_loader, we need to get breakmodel config + # early so that it knows where to load the individual model tensors + if(vars.lazy_load and vars.hascuda and vars.breakmodel): + device_config(model_config) + + # Download model from Huggingface if it does not exist, otherwise load locally + + #If we specify a model and it's in the root directory, we need to move it to the models directory (legacy folder structure to new) + if os.path.isdir(vars.model.replace('/', '_')): + import shutil + shutil.move(vars.model.replace('/', '_'), "models/{}".format(vars.model.replace('/', '_'))) + print("\n", flush=True) + with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(enable=vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if vars.lazy_load else None, dematerialized_modules=True): + if(vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time + lowmem = {} + if(os.path.isdir(vars.custmodpth)): + try: + tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + except Exception as e: + try: + tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + except Exception as e: + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") + try: + model = AutoModelForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem) + except Exception as e: + model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem) + elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))): + try: + tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache") + except Exception as e: + try: + tokenizer = GPT2TokenizerFast.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache") + except Exception as e: + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") + try: + model = AutoModelForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", **lowmem) + except Exception as e: + model = GPTNeoForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", **lowmem) + else: + old_rebuild_tensor = torch._utils._rebuild_tensor + def new_rebuild_tensor(storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, stride): + if(not isinstance(storage, torch_lazy_loader.LazyTensor)): + dtype = storage.dtype + else: + dtype = storage.storage_type.dtype + if(not isinstance(dtype, torch.dtype)): + dtype = storage.storage_type(0).dtype + if(dtype is torch.float32 and len(shape) >= 2): + vars.fp32_model = True + return old_rebuild_tensor(storage, storage_offset, shape, stride) + torch._utils._rebuild_tensor = 
new_rebuild_tensor + + try: + tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache") + except Exception as e: + try: + tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache") + except Exception as e: + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") + try: + model = AutoModelForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", **lowmem) + except Exception as e: + model = GPTNeoForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", **lowmem) + + torch._utils._rebuild_tensor = old_rebuild_tensor + + if not args.colab or args.savemodel: + import shutil + tokenizer.save_pretrained("models/{}".format(vars.model.replace('/', '_'))) + if(vars.fp32_model): # Use save_pretrained to convert fp32 models to fp16 + model = model.half() + model.save_pretrained("models/{}".format(vars.model.replace('/', '_')), max_shard_size="500MiB") + else: # For fp16 models, we can just copy the model files directly + import transformers.configuration_utils + import transformers.modeling_utils + import transformers.file_utils + # Save the config.json + shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, transformers.configuration_utils.CONFIG_NAME, revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), transformers.configuration_utils.CONFIG_NAME)) + if(utils.num_shards is None): + # Save the pytorch_model.bin of an unsharded model + shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, transformers.modeling_utils.WEIGHTS_NAME, revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), transformers.modeling_utils.WEIGHTS_NAME)) + else: + with open(utils.from_pretrained_index_filename) as f: + map_data = json.load(f) + filenames = set(map_data["weight_map"].values()) + # Save the pytorch_model.bin.index.json of a sharded model + shutil.move(utils.from_pretrained_index_filename, os.path.join("models/{}".format(vars.model.replace('/', '_')), transformers.modeling_utils.WEIGHTS_INDEX_NAME)) + # Then save the pytorch_model-#####-of-#####.bin files + for filename in filenames: + shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, filename, revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), filename)) + shutil.rmtree("cache/") + + if(vars.hascuda): + if(vars.usegpu): + vars.modeldim = get_hidden_size_from_model(model) + model = model.half().to(vars.gpu_device) + generator = model.generate + elif(vars.breakmodel): # Use both RAM and VRAM (breakmodel) + vars.modeldim = get_hidden_size_from_model(model) + if(not vars.lazy_load): + device_config(model.config) + move_model_to_devices(model) + else: + model = model.to('cpu').float() + vars.modeldim = get_hidden_size_from_model(model) + generator = model.generate + else: + model.to('cpu').float() + vars.modeldim = get_hidden_size_from_model(model) + generator = model.generate + + # Suppress Author's Note by flagging square brackets (Old implementation) + #vocab = tokenizer.get_vocab() + #vocab_keys = vocab.keys() + #vars.badwords = gettokenids("[") + #for key in vars.badwords: + # vars.badwordsids.append([vocab[key]]) + + 
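# Whichever loading path was taken above, the model, tokenizer, generator and vars.modeldim are all initialized at this point. +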
print("{0}OK! {1} pipeline created!{2}".format(colors.GREEN, vars.model, colors.END)) + else: - # Something went wrong, print the message and quit since we can't initialize an engine - print("{0}ERROR!{1}".format(colors.RED, colors.END)) - print(req.json()) - quit() - -# Ask for ngrok url if Google Colab was selected -if(vars.model == "Colab"): - if(vars.colaburl == ""): - print("{0}NOTE: For the modern KoboldAI Colab's you open the links directly in your browser.\nThis option is only for the KoboldAI Server API, not all features are supported in this mode.\n".format(colors.YELLOW, colors.END)) - print("{0}Enter the URL of the server (For example a trycloudflare link):{1}\n".format(colors.CYAN, colors.END)) - vars.colaburl = input("URL> ") + "/request" - -if(vars.model == "ReadOnly"): - vars.noai = True - -# Set logging level to reduce chatter from Flask -import logging -log = logging.getLogger('werkzeug') -log.setLevel(logging.ERROR) - -# Start flask & SocketIO -print("{0}Initializing Flask... {1}".format(colors.PURPLE, colors.END), end="") -from flask import Flask, render_template, Response, request, copy_current_request_context -from flask_socketio import SocketIO, emit -app = Flask(__name__, root_path=os.getcwd()) -app.config['SECRET KEY'] = 'secret!' -socketio = SocketIO(app, async_method="eventlet") -socketio.start_background_task(check_for_sp_change) -print("{0}OK!{1}".format(colors.GREEN, colors.END)) - -# Start transformers and create pipeline -if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): - if(not vars.noai): - print("{0}Initializing transformers, please wait...{1}".format(colors.PURPLE, colors.END)) - from transformers import StoppingCriteria, GPT2TokenizerFast, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoTokenizer - for m in ("GPTJModel", "XGLMModel"): - try: - globals()[m] = getattr(__import__("transformers"), m) - except: - pass - try: - from transformers.models.opt.modeling_opt import OPTDecoder - except: - pass - import transformers.generation_utils - from transformers import __version__ as transformers_version - + from transformers import GPT2TokenizerFast + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") + else: from transformers import PreTrainedModel from transformers import modeling_utils old_from_pretrained = PreTrainedModel.from_pretrained.__func__ @@ -1214,673 +1932,135 @@ if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "Go return old_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs) modeling_utils.get_checkpoint_shard_files = new_get_checkpoint_shard_files - # Lazy loader - import torch_lazy_loader - def get_lazy_load_callback(n_layers, convert_to_float16=True): - if not vars.lazy_load: - return - from tqdm.auto import tqdm + def tpumtjgenerate_warper_callback(scores) -> "np.array": + scores_shape = scores.shape + scores_list = scores.tolist() + vars.lua_koboldbridge.logits = vars.lua_state.table() + for r, row in enumerate(scores_list): + vars.lua_koboldbridge.logits[r+1] = vars.lua_state.table(*row) + vars.lua_koboldbridge.vocab_size = scores_shape[-1] - if "breakmodel" in globals(): - gpu_blocks = breakmodel.gpu_blocks - ram_blocks = ram_blocks = n_layers - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - else: - ram_blocks = gpu_blocks = cumulative_gpu_blocks = None + 
execute_genmod() - def lazy_load_callback(model_dict, f, **_): - if lazy_load_callback.nested: - return - lazy_load_callback.nested = True - - device_map = {} - - for _key, spec in lazy_load_spec.get("layer_weights", {}).items(): - for layer in range(n_layers): - key = _key.format(layer=layer) - if key not in model_dict: - continue - device = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" if not vars.hascuda or not vars.breakmodel or layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) - device_map[key] = device - - for key, value in model_dict.items(): - if isinstance(value, torch_lazy_loader.LazyTensor) and key not in device_map: - device_map[key] = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" - - if utils.num_shards is None or utils.current_shard == 0: - if utils.num_shards is not None: - num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs)) - else: - num_tensors = len(device_map) - print(flush=True) - utils.bar = tqdm(total=num_tensors, desc="Loading model tensors") - - with zipfile.ZipFile(f, "r") as z: - try: - last_storage_key = None - f = None - current_offset = 0 - if utils.num_shards is not None: - utils.current_shard += 1 - for key in sorted(device_map.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)): - storage_key = model_dict[key].key - if storage_key != last_storage_key or model_dict[key].seek_offset < current_offset: - last_storage_key = storage_key - if isinstance(f, zipfile.ZipExtFile): - f.close() - f = z.open(f"archive/data/{storage_key}") - current_offset = 0 - if current_offset != model_dict[key].seek_offset: - f.read(model_dict[key].seek_offset - current_offset) - current_offset = model_dict[key].seek_offset - device = device_map[key] - size = functools.reduce(lambda x, y: x * y, model_dict[key].shape, 1) - dtype = model_dict[key].dtype - nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) - #print(f"Transferring <{key}> to {'(CPU)' if device == 'cpu' else '[device ' + str(device) + ']'} ... 
", end="", flush=True) - model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - if model_dict[key].dtype is torch.float32: - vars.fp32_model = True - if convert_to_float16 and vars.hascuda and (vars.breakmodel or vars.usegpu) and model_dict[key].dtype is torch.float32: - model_dict[key] = model_dict[key].to(torch.float16) - if not vars.usegpu and not vars.breakmodel and model_dict[key].dtype is torch.float16: - model_dict[key] = model_dict[key].to(torch.float32) - model_dict[key] = model_dict[key].to(device) - #print("OK", flush=True) - current_offset += nbytes - utils.bar.update(1) - finally: - if utils.num_shards is None or utils.current_shard >= utils.num_shards: - utils.bar.close() - utils.bar = None - lazy_load_callback.nested = False - if isinstance(f, zipfile.ZipExtFile): - f.close() - - lazy_load_callback.nested = False - return lazy_load_callback - - lazy_load_config_path = os.path.join("maps", vars.model_type + ".json") - if(vars.lazy_load and "model_config" in globals() and os.path.isfile(lazy_load_config_path)): - with open(lazy_load_config_path) as f: - lazy_load_spec = json.load(f) - - else: - vars.lazy_load = False - - # Some versions of transformers 4.17.0.dev0 are affected by - # https://github.com/huggingface/transformers/issues/15736 - # This is a workaround for those versions of transformers. - if(transformers_version == "4.17.0.dev0"): - try: - from transformers.models.xglm.modeling_xglm import XGLMSinusoidalPositionalEmbedding - except ImportError: - pass - else: - @torch.no_grad() - def new_forward(self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0): - bsz, seq_len = inputs_embeds.size()[:-1] - input_shape = inputs_embeds.size()[:-1] - sequence_length = input_shape[1] - position_ids = torch.arange( - past_key_values_length + self.padding_idx + 1, past_key_values_length + sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device - ).unsqueeze(0).expand(input_shape).contiguous() - max_pos = self.padding_idx + 1 + seq_len + past_key_values_length - if max_pos > self.weights.size(0): - self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx) - return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach() - XGLMSinusoidalPositionalEmbedding.forward = new_forward - - # Patch transformers to use our soft prompt - def patch_causallm(cls): - old_forward = cls.forward - def new_causallm_forward(self, *args, **kwargs): - input_ids = kwargs.get('input_ids').to(self.device) - assert input_ids is not None - kwargs['input_ids'] = None - if(vars.sp is not None): - shifted_input_ids = input_ids - self.config.vocab_size - input_ids.clamp_(max=self.config.vocab_size-1) - if(hasattr(self, "transformer")): - inputs_embeds = self.transformer.wte(input_ids) - elif(not hasattr(self.model, "decoder")): - inputs_embeds = self.model.embed_tokens(input_ids) - else: - inputs_embeds = self.model.decoder.embed_tokens(input_ids) - if(vars.sp is not None): - vars.sp = vars.sp.to(inputs_embeds.dtype).to(inputs_embeds.device) - inputs_embeds = torch.where( - (shifted_input_ids >= 0)[..., None], - vars.sp[shifted_input_ids.clamp(min=0)], - inputs_embeds, - ) - if(hasattr(self, "model") and hasattr(self.model, "embed_scale")): - inputs_embeds *= self.model.embed_scale - kwargs['inputs_embeds'] = inputs_embeds - return old_forward(self, *args, **kwargs) - cls.forward = new_causallm_forward - for cls in (GPT2LMHeadModel, GPTNeoForCausalLM): - 
patch_causallm(cls) - for c in ("GPTJForCausalLM", "XGLMForCausalLM", "OPTForCausalLM"): - try: - patch_causallm(getattr(__import__("transformers"), c)) - except: - pass - - - # Fix a bug in OPTForCausalLM where self.lm_head is the wrong size - if(packaging.version.parse("4.19.0.dev0") <= packaging.version.parse(transformers_version) < packaging.version.parse("4.20.0")): - try: - from transformers import OPTForCausalLM, OPTModel - except ImportError: - pass - else: - # This is the same as the original __init__ but with - # config.hidden_size - # replaced with - # config.word_embed_proj_dim - def new_init(self, config): - super(OPTForCausalLM, self).__init__(config) - self.model = OPTModel(config) - self.lm_head = torch.nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False) - self.post_init() - OPTForCausalLM.__init__ = new_init - - - # Patch transformers to use our custom logit warpers - from transformers import LogitsProcessorList, LogitsWarper, LogitsProcessor, TopKLogitsWarper, TopPLogitsWarper, TemperatureLogitsWarper, RepetitionPenaltyLogitsProcessor - from warpers import AdvancedRepetitionPenaltyLogitsProcessor, TailFreeLogitsWarper, TypicalLogitsWarper, TopALogitsWarper - - def dynamic_processor_wrap(cls, field_name, var_name, cond=None): - old_call = cls.__call__ - def new_call(self, *args, **kwargs): - if(not isinstance(field_name, str) and isinstance(field_name, Iterable)): - conds = [] - for f, v in zip(field_name, var_name): - conds.append(getattr(vars, v)) - setattr(self, f, conds[-1]) - else: - conds = getattr(vars, var_name) - setattr(self, field_name, conds) - assert len(args) == 2 - if(cond is None or cond(conds)): - return old_call(self, *args, **kwargs) - return args[1] - cls.__call__ = new_call - dynamic_processor_wrap(AdvancedRepetitionPenaltyLogitsProcessor, ("penalty", "penalty_slope", "penalty_range"), ("rep_pen", "rep_pen_slope", "rep_pen_range"), cond=lambda x: x[0] != 1.0) - dynamic_processor_wrap(TopKLogitsWarper, "top_k", "top_k", cond=lambda x: x > 0) - dynamic_processor_wrap(TopALogitsWarper, "top_a", "top_a", cond=lambda x: x > 0.0) - dynamic_processor_wrap(TopPLogitsWarper, "top_p", "top_p", cond=lambda x: x < 1.0) - dynamic_processor_wrap(TailFreeLogitsWarper, "tfs", "tfs", cond=lambda x: x < 1.0) - dynamic_processor_wrap(TypicalLogitsWarper, "typical", "typical", cond=lambda x: x < 1.0) - dynamic_processor_wrap(TemperatureLogitsWarper, "temperature", "temp", cond=lambda x: x != 1.0) - RepetitionPenaltyLogitsProcessor.__init__ = AdvancedRepetitionPenaltyLogitsProcessor.__init__ - RepetitionPenaltyLogitsProcessor.__call__ = AdvancedRepetitionPenaltyLogitsProcessor.__call__ - - class LuaLogitsProcessor(LogitsProcessor): - - def __init__(self): - pass - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - assert scores.ndim == 2 - assert input_ids.ndim == 2 - self.regeneration_required = False - self.halt = False - - scores_shape = scores.shape - scores_list = scores.tolist() - vars.lua_koboldbridge.logits = vars.lua_state.table() - for r, row in enumerate(scores_list): - vars.lua_koboldbridge.logits[r+1] = vars.lua_state.table(*row) - vars.lua_koboldbridge.vocab_size = scores_shape[-1] - - execute_genmod() - - scores = torch.tensor( - tuple(tuple(row.values()) for row in vars.lua_koboldbridge.logits.values()), - device=scores.device, - dtype=scores.dtype, - ) - assert scores.shape == scores_shape - - return scores - - def new_get_logits_processor(*args, **kwargs) -> LogitsProcessorList: - 
processors = new_get_logits_processor.old_get_logits_processor(*args, **kwargs) - processors.insert(0, LuaLogitsProcessor()) - return processors - new_get_logits_processor.old_get_logits_processor = transformers.generation_utils.GenerationMixin._get_logits_processor - transformers.generation_utils.GenerationMixin._get_logits_processor = new_get_logits_processor - - class KoboldLogitsWarperList(LogitsProcessorList): - def __init__(self, beams: int = 1, **kwargs): - self.__warper_list: List[LogitsWarper] = [] - self.__warper_list.append(TopKLogitsWarper(top_k=1, min_tokens_to_keep=1 + (beams > 1))) - self.__warper_list.append(TopALogitsWarper(top_a=0.5, min_tokens_to_keep=1 + (beams > 1))) - self.__warper_list.append(TopPLogitsWarper(top_p=0.5, min_tokens_to_keep=1 + (beams > 1))) - self.__warper_list.append(TailFreeLogitsWarper(tfs=0.5, min_tokens_to_keep=1 + (beams > 1))) - self.__warper_list.append(TypicalLogitsWarper(typical=0.5, min_tokens_to_keep=1 + (beams > 1))) - self.__warper_list.append(TemperatureLogitsWarper(temperature=0.5)) - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, *args, **kwargs): - for k in vars.sampler_order: - scores = self.__warper_list[k](input_ids, scores, *args, **kwargs) - return scores - - def new_get_logits_warper(beams: int = 1,) -> LogitsProcessorList: - return KoboldLogitsWarperList(beams=beams) - - def new_sample(self, *args, **kwargs): - assert kwargs.pop("logits_warper", None) is not None - kwargs["logits_warper"] = new_get_logits_warper( - beams=1, + scores = np.array( + tuple(tuple(row.values()) for row in vars.lua_koboldbridge.logits.values()), + dtype=scores.dtype, ) - if(vars.newlinemode == "s") or (vars.newlinemode == "ns"): - kwargs["eos_token_id"] = -1 - kwargs.setdefault("pad_token_id", 2) - return new_sample.old_sample(self, *args, **kwargs) - new_sample.old_sample = transformers.generation_utils.GenerationMixin.sample - transformers.generation_utils.GenerationMixin.sample = new_sample + assert scores.shape == scores_shape - - # Allow bad words filter to ban <|endoftext|> token - import transformers.generation_logits_process - def new_init(self, bad_words_ids: List[List[int]], eos_token_id: int): - return new_init.old_init(self, bad_words_ids, -1) - new_init.old_init = transformers.generation_logits_process.NoBadWordsLogitsProcessor.__init__ - transformers.generation_logits_process.NoBadWordsLogitsProcessor.__init__ = new_init - - - # Sets up dynamic world info scanner - class DynamicWorldInfoScanCriteria(StoppingCriteria): - def __init__( - self, - tokenizer, - excluded_world_info: List[Set], - ): - self.regeneration_required = False - self.halt = False - self.tokenizer = tokenizer - self.excluded_world_info = excluded_world_info - def __call__( - self, - input_ids: torch.LongTensor, - scores: torch.FloatTensor, - **kwargs, - ) -> bool: - vars.generated_tkns += 1 - if(vars.lua_koboldbridge.generated_cols and vars.generated_tkns != vars.lua_koboldbridge.generated_cols): - raise RuntimeError(f"Inconsistency detected between KoboldAI Python and Lua backends ({vars.generated_tkns} != {vars.lua_koboldbridge.generated_cols})") - if(vars.abort or vars.generated_tkns >= vars.genamt): - self.regeneration_required = False - self.halt = False - return True - - assert input_ids.ndim == 2 - assert len(self.excluded_world_info) == input_ids.shape[0] - self.regeneration_required = vars.lua_koboldbridge.regeneration_required - self.halt = not vars.lua_koboldbridge.generating - vars.lua_koboldbridge.regeneration_required = False 
- - for i in range(vars.numseqs): - vars.lua_koboldbridge.generated[i+1][vars.generated_tkns] = int(input_ids[i, -1].item()) - - if(not vars.dynamicscan): - return self.regeneration_required or self.halt - tail = input_ids[..., -vars.generated_tkns:] - for i, t in enumerate(tail): - decoded = utils.decodenewlines(tokenizer.decode(t)) - _, found = checkworldinfo(decoded, force_use_txt=True, actions=vars._actions) - found -= self.excluded_world_info[i] - if(len(found) != 0): - self.regeneration_required = True - break - return self.regeneration_required or self.halt - old_get_stopping_criteria = transformers.generation_utils.GenerationMixin._get_stopping_criteria - def new_get_stopping_criteria(self, *args, **kwargs): - stopping_criteria = old_get_stopping_criteria(self, *args, **kwargs) - global tokenizer - self.kai_scanner = DynamicWorldInfoScanCriteria( - tokenizer=tokenizer, - excluded_world_info=self.kai_scanner_excluded_world_info, - ) - stopping_criteria.insert(0, self.kai_scanner) - return stopping_criteria - transformers.generation_utils.GenerationMixin._get_stopping_criteria = new_get_stopping_criteria - - def get_hidden_size_from_model(model): - try: - return int(model.model.decoder.project_in.in_features) - except: - try: - return int(model.model.decoder.embed_tokens.out_features) - except: - try: - return int(model.transformer.hidden_size) - except: - try: - return int(model.transformer.embed_dim) - except: - return int(model.lm_head.in_features) + return scores - def maybe_low_cpu_mem_usage() -> Dict[str, Any]: - if(packaging.version.parse(transformers_version) < packaging.version.parse("4.11.0")): - print(f"\nWARNING: Please upgrade to transformers 4.11.0 for lower RAM usage. You have transformers {transformers_version}.", file=sys.stderr) - return {} - return {"low_cpu_mem_usage": True} - - @contextlib.contextmanager - def maybe_use_float16(always_use=False): - if(always_use or (vars.hascuda and args.lowmem and (vars.usegpu or vars.breakmodel))): - original_dtype = torch.get_default_dtype() - torch.set_default_dtype(torch.float16) - yield True - torch.set_default_dtype(original_dtype) - else: - yield False + def tpumtjgenerate_stopping_callback(generated, n_generated, excluded_world_info) -> Tuple[List[set], bool, bool]: + vars.generated_tkns += 1 - # If custom GPT2 model was chosen - if(vars.model == "GPT2Custom"): - vars.lazy_load = False - model_config = open(vars.custmodpth + "/config.json", "r") - js = json.load(model_config) - with(maybe_use_float16()): - model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") - tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") - vars.modeldim = get_hidden_size_from_model(model) - # Is CUDA available? 
If so, use GPU, otherwise fall back to CPU - if(vars.hascuda and vars.usegpu): - model = model.half().to(vars.gpu_device) - generator = model.generate - else: - model = model.to('cpu').float() - generator = model.generate - # Use the Generic implementation - else: - lowmem = maybe_low_cpu_mem_usage() - # We must disable low_cpu_mem_usage (by setting lowmem to {}) if - # using a GPT-2 model because GPT-2 is not compatible with this - # feature yet - if(vars.model_type == "gpt2"): - lowmem = {} - - # If we're using torch_lazy_loader, we need to get breakmodel config - # early so that it knows where to load the individual model tensors - if(vars.lazy_load and vars.hascuda and vars.breakmodel): - device_config(model_config) + assert len(excluded_world_info) == len(generated) + regeneration_required = vars.lua_koboldbridge.regeneration_required + halt = vars.abort or not vars.lua_koboldbridge.generating or vars.generated_tkns >= vars.genamt + vars.lua_koboldbridge.regeneration_required = False - # Download model from Huggingface if it does not exist, otherwise load locally - - #If we specify a model and it's in the root directory, we need to move it to the models directory (legacy folder structure to new) - if os.path.isdir(vars.model.replace('/', '_')): - import shutil - shutil.move(vars.model.replace('/', '_'), "models/{}".format(vars.model.replace('/', '_'))) - print("\n", flush=True) - with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(enable=vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if vars.lazy_load else None, dematerialized_modules=True): - if(vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - lowmem = {} - if(os.path.isdir(vars.custmodpth)): - try: - tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") - except Exception as e: - try: - tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") - except Exception as e: - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") - try: - model = AutoModelForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem) - except Exception as e: - model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem) - elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))): - try: - tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache") - except Exception as e: - try: - tokenizer = GPT2TokenizerFast.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache") - except Exception as e: - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") - try: - model = AutoModelForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", **lowmem) - except Exception as e: - model = GPTNeoForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", **lowmem) - else: - old_rebuild_tensor = torch._utils._rebuild_tensor - def new_rebuild_tensor(storage: Union[torch_lazy_loader.LazyTensor, torch.Storage], storage_offset, shape, stride): - if(not isinstance(storage, torch_lazy_loader.LazyTensor)): - dtype = storage.dtype - else: - dtype = 
storage.storage_type.dtype - if(not isinstance(dtype, torch.dtype)): - dtype = storage.storage_type(0).dtype - if(dtype is torch.float32 and len(shape) >= 2): - vars.fp32_model = True - return old_rebuild_tensor(storage, storage_offset, shape, stride) - torch._utils._rebuild_tensor = new_rebuild_tensor + global past - try: - tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache") - except Exception as e: - try: - tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache") - except Exception as e: - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") - try: - model = AutoModelForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", **lowmem) - except Exception as e: - model = GPTNeoForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", **lowmem) + for i in range(vars.numseqs): + vars.lua_koboldbridge.generated[i+1][vars.generated_tkns] = int(generated[i, tpu_mtj_backend.params["seq"] + n_generated - 1].item()) - torch._utils._rebuild_tensor = old_rebuild_tensor + if(not vars.dynamicscan or halt): + return excluded_world_info, regeneration_required, halt - if not args.colab or args.savemodel: - import shutil - tokenizer.save_pretrained("models/{}".format(vars.model.replace('/', '_'))) - if(vars.fp32_model): # Use save_pretrained to convert fp32 models to fp16 - model = model.half() - model.save_pretrained("models/{}".format(vars.model.replace('/', '_')), max_shard_size="500MiB") - else: # For fp16 models, we can just copy the model files directly - import transformers.configuration_utils - import transformers.modeling_utils - import transformers.file_utils - # Save the config.json - shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, transformers.configuration_utils.CONFIG_NAME, revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), transformers.configuration_utils.CONFIG_NAME)) - if(utils.num_shards is None): - # Save the pytorch_model.bin of an unsharded model - shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, transformers.modeling_utils.WEIGHTS_NAME, revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), transformers.modeling_utils.WEIGHTS_NAME)) - else: - with open(utils.from_pretrained_index_filename) as f: - map_data = json.load(f) - filenames = set(map_data["weight_map"].values()) - # Save the pytorch_model.bin.index.json of a sharded model - shutil.move(utils.from_pretrained_index_filename, os.path.join("models/{}".format(vars.model.replace('/', '_')), transformers.modeling_utils.WEIGHTS_INDEX_NAME)) - # Then save the pytorch_model-#####-of-#####.bin files - for filename in filenames: - shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, filename, revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), filename)) - shutil.rmtree("cache/") - - if(vars.hascuda): - if(vars.usegpu): - vars.modeldim = get_hidden_size_from_model(model) - model = model.half().to(vars.gpu_device) - generator = model.generate - elif(vars.breakmodel): # Use both RAM and VRAM (breakmodel) - vars.modeldim = get_hidden_size_from_model(model) - if(not 
vars.lazy_load): - device_config(model.config) - move_model_to_devices(model) - else: - model = model.to('cpu').float() - vars.modeldim = get_hidden_size_from_model(model) - generator = model.generate - else: - model.to('cpu').float() - vars.modeldim = get_hidden_size_from_model(model) - generator = model.generate - - # Suppress Author's Note by flagging square brackets (Old implementation) - #vocab = tokenizer.get_vocab() - #vocab_keys = vocab.keys() - #vars.badwords = gettokenids("[") - #for key in vars.badwords: - # vars.badwordsids.append([vocab[key]]) - - print("{0}OK! {1} pipeline created!{2}".format(colors.GREEN, vars.model, colors.END)) - - else: - from transformers import GPT2TokenizerFast - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") -else: - from transformers import PreTrainedModel - from transformers import modeling_utils - old_from_pretrained = PreTrainedModel.from_pretrained.__func__ - @classmethod - def new_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - vars.fp32_model = False - utils.num_shards = None - utils.current_shard = 0 - utils.from_pretrained_model_name = pretrained_model_name_or_path - utils.from_pretrained_index_filename = None - utils.from_pretrained_kwargs = kwargs - utils.bar = None - if not args.no_aria2: - utils.aria2_hook(pretrained_model_name_or_path, **kwargs) - return old_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs) - PreTrainedModel.from_pretrained = new_from_pretrained - if(hasattr(modeling_utils, "get_checkpoint_shard_files")): - old_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files - def new_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs): - utils.num_shards = utils.get_num_shards(index_filename) - utils.from_pretrained_index_filename = index_filename - return old_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs) - modeling_utils.get_checkpoint_shard_files = new_get_checkpoint_shard_files - - def tpumtjgetsofttokens(): - soft_tokens = None - if(vars.sp is None): - global np - if 'np' not in globals(): - import numpy as np - tensor = np.zeros((1, tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"])), dtype=np.float32) - rows = tensor.shape[0] - padding_amount = tpu_mtj_backend.params["seq"] - (tpu_mtj_backend.params["seq"] % -tpu_mtj_backend.params["cores_per_replica"]) - rows - tensor = np.pad(tensor, ((0, padding_amount), (0, 0))) - tensor = tensor.reshape( - tpu_mtj_backend.params["cores_per_replica"], - -1, - tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"]), - ) - vars.sp = tpu_mtj_backend.shard_xmap(tensor) - soft_tokens = np.arange( - tpu_mtj_backend.params["n_vocab"] + tpu_mtj_backend.params["n_vocab_padding"], - tpu_mtj_backend.params["n_vocab"] + tpu_mtj_backend.params["n_vocab_padding"] + vars.sp_length, - dtype=np.uint32 - ) - return soft_tokens - - def tpumtjgenerate_warper_callback(scores) -> "np.array": - scores_shape = scores.shape - scores_list = scores.tolist() - vars.lua_koboldbridge.logits = vars.lua_state.table() - for r, row in enumerate(scores_list): - vars.lua_koboldbridge.logits[r+1] = vars.lua_state.table(*row) - vars.lua_koboldbridge.vocab_size = scores_shape[-1] - - execute_genmod() - - scores = np.array( - tuple(tuple(row.values()) for row in vars.lua_koboldbridge.logits.values()), - dtype=scores.dtype, - ) - assert scores.shape == scores_shape - - return scores - - 
def tpumtjgenerate_stopping_callback(generated, n_generated, excluded_world_info) -> Tuple[List[set], bool, bool]: - vars.generated_tkns += 1 - - assert len(excluded_world_info) == len(generated) - regeneration_required = vars.lua_koboldbridge.regeneration_required - halt = vars.abort or not vars.lua_koboldbridge.generating or vars.generated_tkns >= vars.genamt - vars.lua_koboldbridge.regeneration_required = False - - global past - - for i in range(vars.numseqs): - vars.lua_koboldbridge.generated[i+1][vars.generated_tkns] = int(generated[i, tpu_mtj_backend.params["seq"] + n_generated - 1].item()) - - if(not vars.dynamicscan or halt): + for i, t in enumerate(generated): + decoded = utils.decodenewlines(tokenizer.decode(past[i])) + utils.decodenewlines(tokenizer.decode(t[tpu_mtj_backend.params["seq"] : tpu_mtj_backend.params["seq"] + n_generated])) + _, found = checkworldinfo(decoded, force_use_txt=True, actions=vars._actions) + found -= excluded_world_info[i] + if(len(found) != 0): + regeneration_required = True + break return excluded_world_info, regeneration_required, halt - for i, t in enumerate(generated): - decoded = utils.decodenewlines(tokenizer.decode(past[i])) + utils.decodenewlines(tokenizer.decode(t[tpu_mtj_backend.params["seq"] : tpu_mtj_backend.params["seq"] + n_generated])) - _, found = checkworldinfo(decoded, force_use_txt=True, actions=vars._actions) - found -= excluded_world_info[i] - if(len(found) != 0): - regeneration_required = True - break - return excluded_world_info, regeneration_required, halt + def tpumtjgenerate_compiling_callback() -> None: + print(colors.GREEN + "TPU backend compilation triggered" + colors.END) + vars.compiling = True - def tpumtjgenerate_compiling_callback() -> None: - print(colors.GREEN + "TPU backend compilation triggered" + colors.END) - vars.compiling = True + def tpumtjgenerate_stopped_compiling_callback() -> None: + vars.compiling = False + + def tpumtjgenerate_settings_callback() -> dict: + return { + "sampler_order": vars.sampler_order, + "top_p": float(vars.top_p), + "temp": float(vars.temp), + "top_k": int(vars.top_k), + "tfs": float(vars.tfs), + "typical": float(vars.typical), + "top_a": float(vars.top_a), + "repetition_penalty": float(vars.rep_pen), + "rpslope": float(vars.rep_pen_slope), + "rprange": int(vars.rep_pen_range), + } - def tpumtjgenerate_stopped_compiling_callback() -> None: - vars.compiling = False + # If we're running Colab or OAI, we still need a tokenizer. 
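+        # (The Colab path loads the GPT-Neo-2.7B tokenizer; the OAI path falls back to the stock GPT-2 tokenizer.)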
+ if(vars.model == "Colab"): + from transformers import GPT2TokenizerFast + tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-2.7B", revision=vars.revision, cache_dir="cache") + loadsettings() + elif(vars.model == "OAI"): + from transformers import GPT2TokenizerFast + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") + loadsettings() + # Load the TPU backend if requested + elif(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): + global tpu_mtj_backend + import tpu_mtj_backend + if(vars.model == "TPUMeshTransformerGPTNeoX"): + vars.badwordsids = vars.badwordsids_neox + print("{0}Initializing Mesh Transformer JAX, please wait...{1}".format(colors.PURPLE, colors.END)) + if vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX") and (not vars.custmodpth or not os.path.isdir(vars.custmodpth)): + raise FileNotFoundError(f"The specified model path {repr(vars.custmodpth)} is not the path to a valid folder") + import tpu_mtj_backend + if(vars.model == "TPUMeshTransformerGPTNeoX" or vars.model_type == "opt"): + tpu_mtj_backend.pad_token_id = 1 + tpu_mtj_backend.vars = vars + tpu_mtj_backend.warper_callback = tpumtjgenerate_warper_callback + tpu_mtj_backend.stopping_callback = tpumtjgenerate_stopping_callback + tpu_mtj_backend.compiling_callback = tpumtjgenerate_compiling_callback + tpu_mtj_backend.stopped_compiling_callback = tpumtjgenerate_stopped_compiling_callback + tpu_mtj_backend.settings_callback = tpumtjgenerate_settings_callback + vars.allowsp = True + loadmodelsettings() + loadsettings() + tpu_mtj_backend.load_model(vars.custmodpth, hf_checkpoint=vars.model not in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX") and vars.use_colab_tpu, **vars.modelconfig) + vars.modeldim = int(tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"])) + tokenizer = tpu_mtj_backend.tokenizer + else: + loadsettings() - def tpumtjgenerate_settings_callback() -> dict: - return { - "sampler_order": vars.sampler_order, - "top_p": float(vars.top_p), - "temp": float(vars.temp), - "top_k": int(vars.top_k), - "tfs": float(vars.tfs), - "typical": float(vars.typical), - "top_a": float(vars.top_a), - "repetition_penalty": float(vars.rep_pen), - "rpslope": float(vars.rep_pen_slope), - "rprange": int(vars.rep_pen_range), - } + lua_startup() + # Load scripts + load_lua_scripts() + + final_startup() + if not initial_load: + set_aibusy(False) + emit('from_server', {'cmd': 'hide_model_name'}, broadcast=True) + time.sleep(0.1) + + if not vars.gamestarted: + setStartState() + sendsettings() + refresh_settings() - # If we're running Colab or OAI, we still need a tokenizer. 
- if(vars.model == "Colab"): - from transformers import GPT2TokenizerFast - tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-2.7B", revision=vars.revision, cache_dir="cache") - loadsettings() - elif(vars.model == "OAI"): - from transformers import GPT2TokenizerFast - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") - loadsettings() - # Load the TPU backend if requested - elif(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): - if(vars.model == "TPUMeshTransformerGPTNeoX"): - vars.badwordsids = vars.badwordsids_neox - print("{0}Initializing Mesh Transformer JAX, please wait...{1}".format(colors.PURPLE, colors.END)) - if vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX") and (not vars.custmodpth or not os.path.isdir(vars.custmodpth)): - raise FileNotFoundError(f"The specified model path {repr(vars.custmodpth)} is not the path to a valid folder") - import tpu_mtj_backend - if(vars.model == "TPUMeshTransformerGPTNeoX" or vars.model_type == "opt"): - tpu_mtj_backend.pad_token_id = 1 - tpu_mtj_backend.vars = vars - tpu_mtj_backend.warper_callback = tpumtjgenerate_warper_callback - tpu_mtj_backend.stopping_callback = tpumtjgenerate_stopping_callback - tpu_mtj_backend.compiling_callback = tpumtjgenerate_compiling_callback - tpu_mtj_backend.stopped_compiling_callback = tpumtjgenerate_stopped_compiling_callback - tpu_mtj_backend.settings_callback = tpumtjgenerate_settings_callback - vars.allowsp = True - loadmodelsettings() - loadsettings() - tpu_mtj_backend.load_model(vars.custmodpth, hf_checkpoint=vars.model not in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX") and vars.use_colab_tpu, **vars.modelconfig) - vars.modeldim = int(tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"])) - tokenizer = tpu_mtj_backend.tokenizer - else: - loadsettings() # Set up Flask routes @app.route('/') @app.route('/index') def index(): - return render_template('index.html') + if 'new_ui' in request.args: + return render_template('index_new.html', hide_ai_menu=args.noaimenu) + else: + return render_template('index.html', hide_ai_menu=args.noaimenu) +@app.route('/favicon.ico') +def favicon(): + return send_from_directory(app.root_path, + 'koboldai.ico', mimetype='image/vnd.microsoft.icon') @app.route('/download') def download(): save_format = request.args.get("format", "json").strip().lower() @@ -1927,29 +2107,67 @@ def download(): #============================ LUA API =============================# +_bridged = {} +F = TypeVar("F", bound=Callable) +def lua_startup(): + global _bridged + global F + global bridged + if(path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): + file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") + js = json.load(file) + if("userscripts" in js): + vars.userscripts = [] + for userscript in js["userscripts"]: + if type(userscript) is not str: + continue + userscript = userscript.strip() + if len(userscript) != 0 and all(q not in userscript for q in ("..", ":")) and all(userscript[0] not in q for q in ("/", "\\")) and os.path.exists(fileops.uspath(userscript)): + vars.userscripts.append(userscript) + if("corescript" in js and type(js["corescript"]) is str and all(q not in js["corescript"] for q in ("..", ":")) and all(js["corescript"][0] not in q for q in ("/", "\\"))): + vars.corescript = js["corescript"] + else: + vars.corescript = "default.lua" + file.close() + + 
#==================================================================# + # Lua runtime startup + #==================================================================# + + print("", end="", flush=True) + print(colors.PURPLE + "Initializing Lua Bridge... " + colors.END, end="", flush=True) + + # Set up Lua state + vars.lua_state = lupa.LuaRuntime(unpack_returned_tuples=True) + + # Load bridge.lua + bridged = { + "corescript_path": "cores", + "userscript_path": "userscripts", + "config_path": "userscripts", + "lib_paths": vars.lua_state.table("lualibs", os.path.join("extern", "lualibs")), + "vars": vars, + } + for kwarg in _bridged: + bridged[kwarg] = _bridged[kwarg] + try: + vars.lua_kobold, vars.lua_koboldcore, vars.lua_koboldbridge = vars.lua_state.globals().dofile("bridge.lua")( + vars.lua_state.globals().python, + bridged, + ) + except lupa.LuaError as e: + print(colors.RED + "ERROR!" + colors.END) + vars.lua_koboldbridge.obliterate_multiverse() + print("{0}{1}{2}".format(colors.RED, "***LUA ERROR***: ", colors.END), end="", file=sys.stderr) + print("{0}{1}{2}".format(colors.RED, str(e).replace("\033", ""), colors.END), file=sys.stderr) + exit(1) + print(colors.GREEN + "OK!" + colors.END) -if(path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") - js = json.load(file) - if("userscripts" in js): - vars.userscripts = [] - for userscript in js["userscripts"]: - if type(userscript) is not str: - continue - userscript = userscript.strip() - if len(userscript) != 0 and all(q not in userscript for q in ("..", ":")) and all(userscript[0] not in q for q in ("/", "\\")) and os.path.exists(fileops.uspath(userscript)): - vars.userscripts.append(userscript) - if("corescript" in js and type(js["corescript"]) is str and all(q not in js["corescript"] for q in ("..", ":")) and all(js["corescript"][0] not in q for q in ("/", "\\"))): - vars.corescript = js["corescript"] - else: - vars.corescript = "default.lua" - file.close() def lua_log_format_name(name): return f"[{name}]" if type(name) is str else "CORE" -_bridged = {} -F = TypeVar("F", bound=Callable) + def bridged_kwarg(name=None): def _bridged_kwarg(f: F): _bridged[name if name is not None else f.__name__[4:] if f.__name__[:4] == "lua_" else f.__name__] = f @@ -2520,41 +2738,7 @@ def execute_outmod(): for k in vars.lua_deleted: inlinedelete(k) -#==================================================================# -# Lua runtime startup -#==================================================================# -print("", end="", flush=True) -print(colors.PURPLE + "Initializing Lua Bridge... " + colors.END, end="", flush=True) - -# Set up Lua state -vars.lua_state = lupa.LuaRuntime(unpack_returned_tuples=True) - -# Load bridge.lua -bridged = { - "corescript_path": "cores", - "userscript_path": "userscripts", - "config_path": "userscripts", - "lib_paths": vars.lua_state.table("lualibs", os.path.join("extern", "lualibs")), - "vars": vars, -} -for kwarg in _bridged: - bridged[kwarg] = _bridged[kwarg] -try: - vars.lua_kobold, vars.lua_koboldcore, vars.lua_koboldbridge = vars.lua_state.globals().dofile("bridge.lua")( - vars.lua_state.globals().python, - bridged, - ) -except lupa.LuaError as e: - print(colors.RED + "ERROR!" 
+ colors.END) - vars.lua_koboldbridge.obliterate_multiverse() - print("{0}{1}{2}".format(colors.RED, "***LUA ERROR***: ", colors.END), end="", file=sys.stderr) - print("{0}{1}{2}".format(colors.RED, str(e).replace("\033", ""), colors.END), file=sys.stderr) - exit(1) -print(colors.GREEN + "OK!" + colors.END) - -# Load scripts -load_lua_scripts() #============================ METHODS =============================# @@ -2898,6 +3082,81 @@ def get_message(msg): raise ValueError(f"Sampler order must be a list of ints, but got a list with at least one non-int element") vars.sampler_order = sampler_order settingschanged() + elif(msg['cmd'] == 'list_model'): + sendModelSelection(menu=msg['data']) + elif(msg['cmd'] == 'load_model'): + if not os.path.exists("settings/"): + os.mkdir("settings") + changed = True + if os.path.exists("settings/" + vars.model.replace('/', '_') + ".breakmodel"): + with open("settings/" + vars.model.replace('/', '_') + ".breakmodel", "r") as file: + if file.read() == msg['gpu_layers']: + changed = False + if changed: + f = open("settings/" + vars.model.replace('/', '_') + ".breakmodel", "w") + f.write(msg['gpu_layers']) + f.close() + vars.colaburl = msg['url'] + "/request" + load_model(use_gpu=msg['use_gpu'], gpu_layers=msg['gpu_layers'], online_model=msg['online_model']) + elif(msg['cmd'] == 'show_model'): + print("Model Name: {}".format(getmodelname())) + emit('from_server', {'cmd': 'show_model_name', 'data': getmodelname()}, broadcast=True) + elif(msg['cmd'] == 'selectmodel'): + # This is run when a model line is selected from the UI (line from the model_menu variable) that is tagged as not a menu + # otherwise we should be running the msg['cmd'] == 'list_model' + + # We have to do a bit of processing though, if we select a custom path, we need to list out the contents of folders + # But if we select something else, we need to potentially show model layers for each GPU + # We might also need to show key input. All of that happens here + + # The data variable will contain the model name. But our Custom lines need a bit more processing + # If we're on a custom line that we have selected a model for, the path variable will be in msg + # so if that's missing we need to run the menu to show the model folders in the models folder + if msg['data'] in ('NeoCustom', 'GPT2Custom') and 'path' not in msg and 'path_modelname' not in msg: + if 'folder' not in msg: + folder = "./models" + else: + folder = msg['folder'] + sendModelSelection(menu=msg['data'], folder=folder) + elif msg['data'] in ('NeoCustom', 'GPT2Custom') and 'path_modelname' in msg: + #Here the user entered custom text in the text box. This could be either a model name or a path. 
+ if check_if_dir_is_model(msg['path_modelname']): + vars.model = msg['data'] + vars.custmodpth = msg['path_modelname'] + get_model_info(msg['data'], directory=msg['path']) + else: + vars.model = msg['path_modelname'] + try: + get_model_info(vars.model) + except: + emit('from_server', {'cmd': 'errmsg', 'data': "The model entered doesn't exist."}) + elif msg['data'] in ('NeoCustom', 'GPT2Custom'): + if check_if_dir_is_model(msg['path']): + vars.model = msg['data'] + vars.custmodpth = msg['path'] + get_model_info(msg['data'], directory=msg['path']) + else: + sendModelSelection(menu=msg['data'], folder=msg['path']) + else: + vars.model = msg['data'] + if 'path' in msg: + vars.custmodpth = msg['path'] + get_model_info(msg['data'], directory=msg['path']) + else: + get_model_info(vars.model) + elif(msg['cmd'] == 'delete_model'): + if "{}/models".format(os.getcwd()) in os.path.abspath(msg['data']) or "{}\\models".format(os.getcwd()) in os.path.abspath(msg['data']): + if check_if_dir_is_model(msg['data']): + print("It's a model, now we really will kill it") + import shutil + shutil.rmtree(msg['data']) + sendModelSelection(menu=msg['menu']) + else: + print("Not a model, don't delete") + else: + print("Ah ah ah, you didn't say the magic word: The selected directory is not in the KoboldAI Models directory, not doing anything.") + elif(msg['cmd'] == 'OAI_Key_Update'): + get_oai_models(msg['key']) elif(msg['cmd'] == 'loadselect'): vars.loadselect = msg["data"] elif(msg['cmd'] == 'spselect'): @@ -2983,7 +3242,7 @@ def sendUSStatItems(): # KoboldAI Markup Formatting (Mixture of Markdown and sanitized html) #==================================================================# def kml(txt): - txt = txt.replace('\>', '>') + txt = txt.replace('>', '&gt;') txt = bleach.clean(markdown.markdown(txt), tags = ['p', 'em', 'strong', 'code', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'ul', 'b', 'i', 'a', 'span', 'button'], styles = ['color', 'font-weight'], attributes=['id', 'class', 'style', 'href']) return txt @@ -3007,6 +3266,7 @@ def setStartState(): #==================================================================# def sendsettings(): # Send settings for selected AI type + emit('from_server', {'cmd': 'reset_menus'}) if(vars.model != "InferKit"): for set in gensettings.gensettingstf: emit('from_server', {'cmd': 'addsetting', 'data': set}) @@ -5023,6 +5283,7 @@ def loadRequest(loadpath, filename=None): vars.lastact = "" vars.submission = "" vars.lastctx = "" + vars.genseqs = [] del vars.actions vars.actions = structures.KoboldStoryRegister() @@ -5284,7 +5545,7 @@ def importgame(): vars.importjs = {} # Reset current save - vars.savedir = getcwd()+"\stories" + vars.savedir = getcwd()+"\\stories" # Refresh game screen vars.laststory = None @@ -5366,7 +5627,7 @@ def importAidgRequest(id): vars.worldinfo_i = [wi for wi in vars.worldinfo if wi["init"]] # Reset current save - vars.savedir = getcwd()+"\stories" + vars.savedir = getcwd()+"\\stories" # Refresh game screen vars.laststory = None @@ -5459,7 +5720,7 @@ def newGameRequest(): vars.lastctx = "" # Reset current save - vars.savedir = getcwd()+"\stories" + vars.savedir = getcwd()+"\\stories" # Refresh game screen vars.laststory = None @@ -5490,51 +5751,52 @@ def randomGameRequest(topic, memory=""): vars.memory = memory emit('from_server', {'cmd': 'setmemory', 'data': vars.memory}, broadcast=True) -# Prevent tokenizer from taking extra time the first time it's used -def __preempt_tokenizer(): - if("tokenizer" not in globals()): - return - 
utils.decodenewlines(tokenizer.decode([25678, 559])) - tokenizer.encode(utils.encodenewlines("eunoia")) -threading.Thread(target=__preempt_tokenizer).start() +def final_startup(): + # Prevent tokenizer from taking extra time the first time it's used + def __preempt_tokenizer(): + if("tokenizer" not in globals()): + return + utils.decodenewlines(tokenizer.decode([25678, 559])) + tokenizer.encode(utils.encodenewlines("eunoia")) + threading.Thread(target=__preempt_tokenizer).start() -# Load soft prompt specified by the settings file, if applicable -if(path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") - js = json.load(file) - if(vars.allowsp and "softprompt" in js and type(js["softprompt"]) is str and all(q not in js["softprompt"] for q in ("..", ":")) and (len(js["softprompt"]) == 0 or all(js["softprompt"][0] not in q for q in ("/", "\\")))): - spRequest(js["softprompt"]) - else: - vars.spfilename = "" - file.close() + # Load soft prompt specified by the settings file, if applicable + if(path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): + file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") + js = json.load(file) + if(vars.allowsp and "softprompt" in js and type(js["softprompt"]) is str and all(q not in js["softprompt"] for q in ("..", ":")) and (len(js["softprompt"]) == 0 or all(js["softprompt"][0] not in q for q in ("/", "\\")))): + spRequest(js["softprompt"]) + else: + vars.spfilename = "" + file.close() -# Precompile TPU backend if required -if(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): - soft_tokens = tpumtjgetsofttokens() - if(vars.dynamicscan or (not vars.nogenmod and vars.has_genmod)): - threading.Thread( - target=tpu_mtj_backend.infer_dynamic, - args=(np.tile(np.uint32((23403, 727, 20185)), (vars.numseqs, 1)),), - kwargs={ - "soft_embeddings": vars.sp, - "soft_tokens": soft_tokens, - "gen_len": 1, - "use_callback": False, - "numseqs": vars.numseqs, - "excluded_world_info": list(set() for _ in range(vars.numseqs)), - }, - ).start() - else: - threading.Thread( - target=tpu_mtj_backend.infer_static, - args=(np.uint32((23403, 727, 20185)),), - kwargs={ - "soft_embeddings": vars.sp, - "soft_tokens": soft_tokens, - "gen_len": 1, - "numseqs": vars.numseqs, - }, - ).start() + # Precompile TPU backend if required + if(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): + soft_tokens = tpumtjgetsofttokens() + if(vars.dynamicscan or (not vars.nogenmod and vars.has_genmod)): + threading.Thread( + target=tpu_mtj_backend.infer_dynamic, + args=(np.tile(np.uint32((23403, 727, 20185)), (vars.numseqs, 1)),), + kwargs={ + "soft_embeddings": vars.sp, + "soft_tokens": soft_tokens, + "gen_len": 1, + "use_callback": False, + "numseqs": vars.numseqs, + "excluded_world_info": list(set() for _ in range(vars.numseqs)), + }, + ).start() + else: + threading.Thread( + target=tpu_mtj_backend.infer_static, + args=(np.uint32((23403, 727, 20185)),), + kwargs={ + "soft_embeddings": vars.sp, + "soft_tokens": soft_tokens, + "gen_len": 1, + "numseqs": vars.numseqs, + }, + ).start() def send_debug(): if vars.debug: @@ -5575,11 +5837,18 @@ def send_debug(): #==================================================================# print("", end="", flush=True) if __name__ == "__main__": - port = args.port if "port" in args and args.port is not None else 5000 print("{0}\nStarting 
webserver...{1}".format(colors.GREEN, colors.END), flush=True) - # Start Flask/SocketIO (Blocking, so this must be last method!) + general_startup() + patch_transformers() + #show_select_model_list() + if vars.model == "" or vars.model is None: + vars.model = "ReadOnly" + load_model(initial_load=True) + # Start Flask/SocketIO (Blocking, so this must be last method!) + port = args.port if "port" in args and args.port is not None else 5000 + #socketio.run(app, host='0.0.0.0', port=port) if(vars.host): if(args.localtunnel): @@ -5589,7 +5858,7 @@ if __name__ == "__main__": while attempts < 10: try: cloudflare = str(localtunnel.stdout.readline()) - cloudflare = (re.search("(?Phttps?:\/\/[^\s]+loca.lt)", cloudflare).group("url")) + cloudflare = (re.search("(?Phttps?://[^s]+loca.lt)", cloudflare).group("url")) break except: attempts += 1 @@ -5626,4 +5895,10 @@ if __name__ == "__main__": socketio.run(app, port=port) else: + general_startup() + patch_transformers() + #show_select_model_list() + if vars.model == "" or vars.model is None: + vars.model = "ReadOnly" + load_model(initial_load=True) print("{0}\nServer started in WSGI mode!{1}".format(colors.GREEN, colors.END), flush=True) diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..c930ba37 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = --ignore=miniconda3 --ignore=runtime --html=unit_test_report.html --self-contained-html -v \ No newline at end of file diff --git a/readme.md b/readme.md index a136c856..ee0d601b 100644 --- a/readme.md +++ b/readme.md @@ -221,7 +221,7 @@ This project contains work from the following contributors : * The Gantian - Creator of KoboldAI, has created most features such as the interface, the different AI model / API integrations and in general the largest part of the project. * VE FORBRYDERNE - Contributed many features such as the Editing overhaul, Adventure Mode, expansions to the world info section, breakmodel integration, scripting support, softpromtps and much more. As well as vastly improving the TPU compatibility and integrating external code into KoboldAI so we could use official versions of Transformers with virtually no downsides. * Henk717 - Contributed the installation scripts, this readme, random story generator, the docker scripts, the foundation for the commandline interface and other smaller changes as well as integrating multiple parts of the code of different forks to unite it all. He also optimized the model loading so that downloaded models get converted to efficient offline models and that in future models are more likely to work out of the box. Not all code Github attributes to Henk717 is by Henk717 as some of it has been integrations of other people's work. We try to clarify this in the contributors list as much as we can. -* Ebolam - Automatic Saving +* Ebolam - Automatic Saving, back/redo, pinning, web loading of models * Frogging101 - top\_k / tfs support (Part of this support was later redone by VE to integrate what was originally inside of finetuneanon's transformers) * UWUplus (Ralf) - Contributed storage systems for community colabs, as well as cleaning up and integrating the website dependencies/code better. He is also the maintainer of flask-cloudflared which we use to generate the cloudflare links. * Javalar - Initial Performance increases on the story\_refresh @@ -238,4 +238,4 @@ Did we miss your contribution? 
Feel free to issue a commit adding your name to t KoboldAI is licensed with an AGPL license, in short this means that it can be used by anyone for any purpose. However, if you decide to make a publicly available instance your users are entitled to a copy of the source code including all modifications that you have made (which needs to be available through an interface such as a button on your website), you may also not distribute this project in a form that does not contain the source code (Such as compiling / encrypting the code and distributing this version without also distributing the source code that includes the changes that you made. You are allowed to distribute this in a closed form if you also provide a separate archive with the source code.). -umamba.exe is bundled for convenience because we observed that many of our users had trouble with command line download methods, it is not part of our project and does not fall under the AGPL license. It is licensed under the BSD-3-Clause license. Other files with differing licenses will have a reference or embedded version of this license within the file. \ No newline at end of file +umamba.exe is bundled for convenience because we observed that many of our users had trouble with command line download methods, it is not part of our project and does not fall under the AGPL license. It is licensed under the BSD-3-Clause license. Other files with differing licenses will have a reference or embedded version of this license within the file. diff --git a/static/application.js b/static/application.js index 3cddea87..a9c0d106 100644 --- a/static/application.js +++ b/static/application.js @@ -7,6 +7,7 @@ var socket; // UI references for jQuery var connect_status; +var button_loadmodel; var button_newgame; var button_rndgame; var button_save; @@ -56,6 +57,7 @@ var savepins; var topic; var saveas_accept; var saveas_close; +var loadmodelpopup; var loadpopup; var loadcontent; var load_accept; @@ -99,6 +101,7 @@ var remote = false; var gamestate = ""; var gamesaved = true; var modelname = null; +var model = ""; // This is true iff [we're in macOS and the browser is Safari] or [we're in iOS] var using_webkit_patch = true; @@ -152,6 +155,12 @@ function getThrottle(ms) { } } +function reset_menus() { + settings_menu.html(""); + format_menu.html(""); + wi_menu.html(""); +} + function addSetting(ob) { // Add setting block to Settings Menu if(ob.uitype == "slider"){ @@ -877,6 +886,7 @@ function formatChunkInnerText(chunk) { } function dosubmit(disallow_abort) { + submit_start = Date.now(); var txt = input_text.val().replace(/\u00a0/g, " "); if((disallow_abort || gamestate !== "wait") && !memorymode && !gamestarted && ((!adventure || !action_mode) && txt.trim().length == 0)) { return; } @@ -947,6 +957,17 @@ function sendSaveAsRequest() { socket.send({'cmd': 'saveasrequest', 'data': {"name": saveasinput.val(), "pins": savepins.val()}}); } +function showLoadModelPopup() { + loadmodelpopup.removeClass("hidden"); + loadmodelpopup.addClass("flex"); +} + +function hideLoadModelPopup() { + loadmodelpopup.removeClass("flex"); + loadmodelpopup.addClass("hidden"); + loadmodelcontent.html(""); +} + function showLoadPopup() { loadpopup.removeClass("hidden"); loadpopup.addClass("flex"); @@ -990,6 +1011,92 @@ function hideSamplersPopup() { samplerspopup.addClass("hidden"); } + +function buildLoadModelList(ar, menu, breadcrumbs, showdelete) { + disableButtons([load_model_accept]); + loadmodelcontent.html(""); + $("#loadmodellistbreadcrumbs").html(""); + 
$("#custommodelname").addClass("hidden"); + var i; + for(i=0; i"+breadcrumbs[i][1]+"\\"); + $("#model_breadcrumbs"+i).off("click").on("click", (function () { + return function () { + socket.send({'cmd': 'selectmodel', 'data': $(this).attr("name"), 'folder': $(this).attr("value")}); + disableButtons([load_model_accept]); + } + })(i)); + } + if (breadcrumbs.length > 0) { + $("#loadmodellistbreadcrumbs").append("
") + } + for(i=0; i\ +
" + //if the menu item is a link to another menu + if(ar[i][3]) { + html = html + "" + } else { + //this is a model + html = html + "
" + } + + //now let's do the delete icon if applicable + if (['NeoCustom', 'GPT2Custom'].includes(menu) && !ar[i][3] && showdelete) { + html = html + "" + } else { + html = html + "
" + } + + html = html + "
\ +
\ +
"+folder+"
\ +
"+ar[i][2]+"
\ +
\ + " + loadmodelcontent.append(html); + //If this is a menu + if(ar[i][3]) { + $("#loadmodel"+i).off("click").on("click", (function () { + return function () { + socket.send({'cmd': 'list_model', 'data': $(this).attr("name"), 'pretty_name': $(this).attr("pretty_name")}); + disableButtons([load_model_accept]); + } + })(i)); + //If we're in the custom load menu (we need to send the path data back in that case) + } else if(['NeoCustom', 'GPT2Custom'].includes(menu)) { + $("#loadmodel"+i).off("click").on("click", (function () { + return function () { + socket.send({'cmd': 'selectmodel', 'data': $(this).attr("name"), 'path': $(this).attr("pretty_name")}); + highlightLoadLine($(this)); + } + })(i)); + $("#custommodelname").removeClass("hidden"); + $("#custommodelname")[0].setAttribute("menu", menu); + //Normal load + } else { + $("#loadmodel"+i).off("click").on("click", (function () { + return function () { + $("#use_gpu_div").addClass("hidden"); + $("#modelkey").addClass("hidden"); + $("#modellayers").addClass("hidden"); + socket.send({'cmd': 'selectmodel', 'data': $(this).attr("name")}); + highlightLoadLine($(this)); + } + })(i)); + } + } +} + function buildLoadList(ar) { disableButtons([load_accept]); loadcontent.html(""); @@ -1148,6 +1255,7 @@ function buildSamplerList(samplers) { function highlightLoadLine(ref) { $("#loadlistcontent > div > div.popuplistselected").removeClass("popuplistselected"); + $("#loadmodellistcontent > div > div.popuplistselected").removeClass("popuplistselected"); ref.addClass("popuplistselected"); } @@ -1851,6 +1959,30 @@ function unbindGametext() { gametext_bound = false; } +function update_gpu_layers() { + var gpu_layers + gpu_layers = 0; + for (let i=0; i < $("#gpu_count")[0].value; i++) { + gpu_layers += parseInt($("#gpu_layers"+i)[0].value); + $("#gpu_layers_box_"+i)[0].value=$("#gpu_layers"+i)[0].value; + } + if (gpu_layers > parseInt(document.getElementById("gpu_layers_max").innerHTML)) { + disableButtons([load_model_accept]); + $("#gpu_layers_current").html(""+gpu_layers+"/"+ document.getElementById("gpu_layers_max").innerHTML +""); + } else { + enableButtons([load_model_accept]); + $("#gpu_layers_current").html(gpu_layers+"/"+document.getElementById("gpu_layers_max").innerHTML); + } +} + + +function RemoveAllButFirstOption(selectElement) { + var i, L = selectElement.options.length - 1; + for(i = L; i >= 1; i--) { + selectElement.remove(i); + } +} + //=================================================================// // READY/RUNTIME //=================================================================// @@ -1859,6 +1991,8 @@ $(document).ready(function(){ // Bind UI references connect_status = $('#connectstatus'); + button_loadmodel = $('#btn_loadmodel'); + button_showmodel = $('#btn_showmodel'); button_newgame = $('#btn_newgame'); button_rndgame = $('#btn_rndgame'); button_save = $('#btn_save'); @@ -1912,9 +2046,13 @@ $(document).ready(function(){ saveas_accept = $("#btn_saveasaccept"); saveas_close = $("#btn_saveasclose"); loadpopup = $("#loadcontainer"); + loadmodelpopup = $("#loadmodelcontainer"); loadcontent = $("#loadlistcontent"); + loadmodelcontent = $("#loadmodellistcontent"); load_accept = $("#btn_loadaccept"); load_close = $("#btn_loadclose"); + load_model_accept = $("#btn_loadmodelaccept"); + load_model_close = $("#btn_loadmodelclose"); sppopup = $("#spcontainer"); spcontent = $("#splistcontent"); sp_accept = $("#btn_spaccept"); @@ -1941,6 +2079,7 @@ $(document).ready(function(){ socket = io.connect(window.document.origin, {transports: 
['polling', 'websocket'], closeOnBeforeunload: false}); socket.on('from_server', function(msg) { + //console.log(msg); if(msg.cmd == "connected") { // Connected to Server Actions sman_allow_delete = msg.hasOwnProperty("smandelete") && msg.smandelete; @@ -1954,9 +2093,7 @@ $(document).ready(function(){ connect_status.removeClass("color_orange"); connect_status.addClass("color_green"); // Reset Menus - settings_menu.html(""); - format_menu.html(""); - wi_menu.html(""); + reset_menus(); // Set up "Allow Editing" $('body').on('input', autofocus); $('#allowediting').prop('checked', allowedit).prop('disabled', false).change().off('change').on('change', function () { @@ -2018,6 +2155,10 @@ $(document).ready(function(){ scrollToBottom(); } else if(msg.cmd == "updatechunk") { hideMessage(); + if (typeof submit_start !== 'undefined') { + $("#runtime")[0].innerHTML = `Generation time: ${Math.round((Date.now() - submit_start)/1000)} sec`; + delete submit_start; + } var index = msg.data.index; var html = msg.data.html; var existingChunk = game_text.children('#n' + index); @@ -2063,14 +2204,17 @@ $(document).ready(function(){ enableButtons([button_actmem, button_actwi, button_actback, button_actfwd, button_actretry]); hideWaitAnimation(); gamestate = "ready"; + favicon.stop_swap(); } else if(msg.data == "wait") { gamestate = "wait"; disableSendBtn(); disableButtons([button_actmem, button_actwi, button_actback, button_actfwd, button_actretry]); showWaitAnimation(); + favicon.start_swap(); } else if(msg.data == "start") { setStartState(); gamestate = "ready"; + favicon.stop_swap(); } } else if(msg.cmd == "allowsp") { allowsp = !!msg.data; @@ -2219,6 +2363,8 @@ $(document).ready(function(){ } else if(msg.cmd == "setanotetemplate") { // Set contents of Author's Note Template field $("#anotetemplate").val(msg.data); + } else if(msg.cmd == "reset_menus") { + reset_menus(); } else if(msg.cmd == "addsetting") { // Add setting controls addSetting(msg.data); @@ -2425,6 +2571,88 @@ $(document).ready(function(){ } else { debug_area.addClass("hidden"); } + } else if(msg.cmd == 'show_model_menu') { + //console.log(msg) + $("#use_gpu_div").addClass("hidden"); + $("#modelkey").addClass("hidden"); + $("#modellayers").addClass("hidden"); + $("#oaimodel").addClass("hidden") + buildLoadModelList(msg.data, msg.menu, msg.breadcrumbs, msg.showdelete); + } else if(msg.cmd == 'selected_model_info') { + enableButtons([load_model_accept]); + $("#oaimodel").addClass("hidden") + $("#oaimodel")[0].options[0].selected = true; + if (msg.key) { + $("#modelkey").removeClass("hidden"); + $("#modelkey")[0].value = msg.key_value; + } else { + $("#modelkey").addClass("hidden"); + + } + if (msg.url) { + $("#modelurl").removeClass("hidden"); + } else { + $("#modelurl").addClass("hidden"); + } + if (msg.gpu) { + $("#use_gpu_div").removeClass("hidden"); + } else { + $("#use_gpu_div").addClass("hidden"); + } + if (msg.breakmodel) { + var html; + $("#modellayers").removeClass("hidden"); + html = ""; + for (let i = 0; i < msg.gpu_names.length; i++) { + html += "GPU " + i + " " + msg.gpu_names[i] + ": "; + html += ''; + html += ""; + } + $("#model_layer_bars").html(html); + $("#gpu_layers_max").html(msg.layer_count); + $("#gpu_count")[0].value = msg.gpu_count; + update_gpu_layers(); + } else { + $("#modellayers").addClass("hidden"); + } + } else if(msg.cmd == 'oai_engines') { + $("#oaimodel").removeClass("hidden") + selected_item = 0; + length = $("#oaimodel")[0].options.length; + for (let i = 0; i < length; i++) { + 
$("#oaimodel")[0].options.remove(1); + } + msg.data.forEach(function (item, index) { + var option = document.createElement("option"); + option.value = item[0]; + option.text = item[1]; + if(msg.online_model == item[0]) { + selected_item = index+1; + } + $("#oaimodel")[0].appendChild(option); + if(selected_item != "") { + $("#oaimodel")[0].options[selected_item].selected = true; + } + }) + } else if(msg.cmd == 'show_model_name') { + $("#showmodelnamecontent").html("
" + msg.data + "
"); + $("#showmodelnamecontainer").removeClass("hidden"); + } else if(msg.cmd == 'hide_model_name') { + $("#showmodelnamecontainer").addClass("hidden"); + //console.log("Closing window"); + } else if(msg.cmd == 'model_load_status') { + $("#showmodelnamecontent").html("
" + msg.data + "
"); + $("#showmodelnamecontainer").removeClass("hidden"); + //console.log(msg.data); + } else if(msg.cmd == 'oai_engines') { + RemoveAllButFirstOption($("#oaimodel")[0]); + for (const engine of msg.data) { + var opt = document.createElement('option'); + opt.value = engine[0]; + opt.innerHTML = engine[1]; + $("#oaimodel")[0].appendChild(opt); + } } }); @@ -2673,12 +2901,35 @@ $(document).ready(function(){ hideLoadPopup(); }); + load_model_close.on("click", function(ev) { + $("#modellayers").addClass("hidden"); + hideLoadModelPopup(); + }); + load_accept.on("click", function(ev) { hideMessage(); newly_loaded = true; socket.send({'cmd': 'loadrequest', 'data': ''}); hideLoadPopup(); }); + + load_model_accept.on("click", function(ev) { + hideMessage(); + var gpu_layers; + var message; + if($("#modellayers")[0].classList.contains('hidden')) { + gpu_layers = "," + } else { + gpu_layers = "" + for (let i=0; i < $("#gpu_count")[0].value; i++) { + gpu_layers += $("#gpu_layers"+i)[0].value + ","; + } + } + message = {'cmd': 'load_model', 'use_gpu': $('#use_gpu')[0].checked, 'key': $('#modelkey')[0].value, 'gpu_layers': gpu_layers.slice(0, -1), 'url': $('#modelurl')[0].value, 'online_model': $('#oaimodel')[0].value}; + socket.send(message); + loadmodelcontent.html(""); + hideLoadModelPopup(); + }); sp_close.on("click", function(ev) { hideSPPopup(); @@ -2712,6 +2963,14 @@ $(document).ready(function(){ hideSamplersPopup(); }); + button_loadmodel.on("click", function(ev) { + showLoadModelPopup(); + socket.send({'cmd': 'list_model', 'data': 'mainmenu'}); + }); + button_showmodel.on("click", function(ev) { + socket.send({'cmd': 'show_model', 'data': ''}); + }); + button_newgame.on("click", function(ev) { if(connected) { showNewStoryPopup(); diff --git a/static/custom.css b/static/custom.css index 640cb8db..6d88f1af 100644 --- a/static/custom.css +++ b/static/custom.css @@ -368,14 +368,14 @@ body.connected #popupfooter, #popupfooter.always-available { margin-top: 200px; } -#loadpopup { +.loadpopup { width: 500px; background-color: #262626; margin-top: 100px; } @media (max-width: 768px) { - #loadpopup { + .loadpopup { width: 100%; background-color: #262626; margin-top: 100px; @@ -1055,7 +1055,7 @@ body.connected .statusiconlabel, .statusiconlabel.always-available { } .loadlistitem { - padding: 5px 10px 5px 10px; + padding: 0px 0px 0px 0px; display: flex; flex-grow: 1; color: #ffffff; @@ -1071,6 +1071,28 @@ body.connected .statusiconlabel, .statusiconlabel.always-available { background-color: #688f1f; } +.breadcrumbitem { + padding: 5px 10px 5px 10px; + color: #ffffff; + background-color: transparent; + border: none; + + -moz-transition: background-color 0.25s ease-in; + -o-transition: background-color 0.25s ease-in; + -webkit-transition: background-color 0.25s ease-in; + transition: background-color 0.25s ease-in; +} + +.breadcrumbitem:hover { + cursor: pointer; + background-color: #688f1f; +} + +hr { + padding: 0px; + margin: 0px; +} + .loadlistpadding { padding-right: 10px; } @@ -1462,3 +1484,65 @@ body.connected .popupfooter, .popupfooter.always-available { overflow: hidden; font-size: 12pt; } + +.model_layers { + width: 3ch; + background-color: inherit; + border: none; + outline: none; +} + +.model_layers:focus { + color: #cdf; +} + +.menu_icon { + position: fixed; + top:10px; + left: 5px; + z-index:100; + display: inline-block; + cursor: pointer; +} + +.SideMenu { + height: 100%; + width: 0; + position: fixed; + z-index: 1; + top: 0; + left: 0; + background-color: #111; + overflow-x: hidden; + 
transition: 0.5s; + padding-top: 60px; +} + +.SideMenu.open { + width: 450px; +} + +@media only screen and (max-width: 768px) { + .SideMenu.open { + width: 100%; + } +} + + +.menubar1, .menubar2, .menubar3 { + width: 21px; + height: 3px; + background-color: #999; + margin: 3px 0; + transition: 0.4s; +} + +.change .menubar1 { + transform: translate(0px, 6px) rotate(-45deg); +} + +.change .menubar2 {opacity: 0;} + +.change .menubar3 { + transform: translate(0px, -6px) rotate(45deg); +} \ No newline at end of file diff --git a/static/favicon.js b/static/favicon.js new file mode 100644 index 00000000..180059ff --- /dev/null +++ b/static/favicon.js @@ -0,0 +1,64 @@ +// Global Definitions +var fav_icon2 = "data:image/x-icon;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAMAAAAoLQ9TAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAB+1BMVEUAAAAAAAAAAAAAAAAAAQAAAAAAAQAAAAAAAAASFhBBWD4iUyoFEwgFEwguUTM+VDoMFAwAAAA+elIudz8AAAAAAAA0MigyLyQAAAAbLh1LdElSbUoVMBkAAABAZ0M2fkUAAAABAQFMiGQraDkAAQANFxEGFQkLFg8EEAYAAAAsZDonZjUAAABCgVVAnFYrSjhEjFpFi1sdRScAAAAjOi8VMxx1dGOFgGYAAABOTEabmIdlYlQaGhgaGhddXFauqY5JRjoAAAAAAAABAQFGeExIl1lX0XRW0XRHi1RFe02vv5W31KFd1Hpc1Hpe1HvO1KvDvJlqZ1plYVOmoIVt1IFl1H7AuZp1cV9jX1AmSCw3Nzg7NmA1MTJuz4Bm1H5MST9HPl9BQEMgNiNXgWKiobFgXICDd5dfw3RZVnJiV3zGv9Bqf29Oj2G/v8hTTpGhl8dbxHVd0npiYoxhWJvIxtlcimZFn1lRclg9SkZNblZBeEpDbEZCa0ZBc0hLY1BAS1BdaV87j01Vx3FWynJSrGZOhlVasGtas2xatm1at21WnWJQm15WyXJQvmlavnBZrGlEYEJWe1RBWz9Um2BavXBgxn9XhllGY0RLaklXiFlTwG5OpmVSfFNMbUpGZEVLa0lShldEhVCChHiKiHvWz6/Kw6WWlZGAfmj///8kr0X+AAAARHRSTlMAASFrcAhxIjLb/vWvsPb+20b4+DFFyMkz2vf43CP9/m5y9vZysLGvsQn19mz+/tz4+NxHycr3+Ejb/vaxsPX+3TRtcBrzrrgAAAABYktHRKhQCDaSAAAAB3RJTUUH5gYJFyQy3tftxgAAAQBJREFUGNNjYGBgYGRiZmFlZWNmZ2SAAA5OLm4eXj5+AQ6ogKCQi6ubu4ensCCIxygiKubl7ePr6+cfIC4owcjAJCkVGBQc4usbGhYeIS0jy8AsFxkVHRPr6xsXn5CYJK/AoKiUnJKalg5UkZGZla2swsCqmpObl1/g61tYVFxSqsbKwKpeVl5RWVVdU1tX39CoocnAotXU3NLa1t7R2dXd06utwqCj6+vb1z9h4sRJk6f4+uopMLDrG0z1nTZ94sQZM31nGRrJMjBKGJvMnjN3wrz5CxaaCnKAvSNqtmjxkqXLlptbQP0iYmllbWNrZ+/gCBVgZHdS1GR1VpAFqQcApI0/jqlZOvEAAAAldEVYdGRhdGU6Y3JlYXRlADIwMjItMDYtMDlUMjM6MzY6NTArMDA6MDDi0xr+AAAAJXRFWHRkYXRlOm1vZGlmeQAyMDIyLTA2LTA5VDIzOjM2OjUwKzAwOjAwk46iQgAAAABJRU5ErkJggg=="; +var fav_icon1 = 
"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAMAAAAoLQ9TAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAB+FBMVEUAAAAAAAAAAAAAAAAAAAEAAAAAAQEAAAAAAAAUFRlLVGYrSWgHEBoHEBk3S19HUGMOExkAAABOcos7apIAAAAAAAA2Ly01KyoAAAAgKzdVaX9bZHIaKzwAAABKYHhDcZgAAAABAQFfgJY2XX0AAQEQFhoIEhwOFRgGDRUAAAAAAQE3W3cyWnwAAABSeJJRjLs1R1FVgaFWgJ4lPlMAAAAsOD4aLj55bm2Md3QAAABPSkmfko9pXlsbGRkbGRlfWlm1oJxMQkAAAAAAAAABAQFTb4tYibFtvPpWgKNScpC6s7nExtNzwPp1wPnZx8jMsKtuZGFoXVutmJODwfJ7wfbHr6p5a2hnW1gtQlI4ODk7N2A2LzWDvet8wPZPRkRHPl9CQUQlMTthe4+ko7RhXYGEeJhzsuJaVXRjWHzIwtNwfYddhqLCwcpTTpGimMhvsuVzv/djYpBgWJvLydxlgptVirdZbX1ASFZUaXtOb4xOZX1OZHxNa4ZRX21DSV5gaG9Je6lqsepstO1knclcfJxtoc5tpNFuptVup9ZnkbdgjrVss+xjpuBvrd9snspOW29jdI5LVmlkj7Vvrd54t+RlfptQXXJWZHtlf51oruNgmMFfdJBYZn1RXnRWZXthfZxSeZiGgYGOhYLdxb/RubWZlpWFd3T////2kwjgAAAARXRSTlMAASFrcAhxIjLb/vWvsPb+20b4+DFFyMkz2vf43CP9/m5y9vZysLGvsQlw9fZs/v7c+PjcR8nK9/hI2/72sbD1/t00bXBAFktiAAAAAWJLR0SnwLcrAwAAAAd0SU1FB+YGCRchHQhxJNoAAAD/SURBVBjTY2BgYGBkYmZhZWVjZmdkgAAOTi5uHl4+fgEOqICgkKubu7uHp7AgiMcoIirm5e3j4+Pr5y8uKMHIwCQpFRAYFOzjExIaFi4tI8vALBcRGRUd4+MTGxefkCivwKColJSckpoGVJGekZmlrMLAqpqdk5uX7+NTUFhUXKLGysCqXlpWXlFZVV1TW1ffoKHJoKXd2NTc0trW3tHZ1d2jo8Kgq+fj09vXP2HCxEmTfXz0FRjYDQyn+EydNmHC9Bk+M42MZRkYJUxMZ82e0z933vwFZoIcYO+Imi9ctHjJ0mUWllC/iFhZ29ja2Ts4OkEFGNmdFTVZXRRkQeoBhkE/Yj5NSZ4AAAAldEVYdGRhdGU6Y3JlYXRlADIwMjItMDYtMDlUMjM6MzM6MjgrMDA6MDA90JbEAAAAJXRFWHRkYXRlOm1vZGlmeQAyMDIyLTA2LTA5VDIzOjMzOjI4KzAwOjAwTI0ueAAAAABJRU5ErkJggg=="; +var fav_icon = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAMAAAAoLQ9TAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAB8lBMVEUAAAAAAAAAAAAAAAABAAAAAAABAAAAAAAAAAAdEBB0Pz5rKCgaBwcZBwdkMzJxPDocDAwAAACLTU6SOzsAAAAAAAA9Mic/LyEAAAA6HByQUUaIVEY+GBgAAACAQkKaQUIAAAABAQGWXl9+NjYBAAAaEBAcCAgZDQ0WBQUAAAB3Nzd9MjIAAACTUVK7UVJRNTWhVVaeVldTJSUAAAA+LC0+GhuGcmCgf2EAAABUTESrl4NzYlEdGhcdGhdiXFbIqIhWRjcAAAAAAAABAQGUSkq1VVX6bW6oUVGXS0vmro7+uJn6c3T6dXX/yqPnu5F3aFhxYVG/oH/7gHv6enjeuJOEcFtzX01VLCs4ODk7NmA5MTH1gHr6e3hWSTxHPl9CQUQ/JCKPYGGko7RhXYGEeJjmcW9cVnFjWH3IwtOHb3CjXV3CwcpTTpGimMjlb3D4c3RmYI1gWJvLydybZWW+T0x+V1hRP0Z7U1WTSEiHRUWGRUSORkZuTlBRQVBwX2CvRkXtaGjvamrNYWKmU1PVZ2fXaGjbaWncaWnAX1+7W1vkYF/ja2zRZWV9QkGeVFN2Pz69XV3ia2zkeHmpWFd/REOJSUirWVjjaGjBYGCeUlKMSkl8QkGBRUSoVlWeUE2QgXeWiHr1zqjmw5+bl5KVe2T///8NZLRGAAAARHRSTlMAASFrcAhxIjLb/vWvsPb+20b4+DFFyMkz2vf43CP9/m5y9vZysLGvsQn19mz+/tz4+NxHycr3+Ejb/vaxsPX+3TRtcBrzrrgAAAABYktHRKUuuUovAAAAB3RJTUUH5gYJFzsfVlK/LQAAAP9JREFUGNNjYGBgYGRiZmFlZWNmZ2SAAA5OLm4eXj5+AQ6ogKCQi6ubm7uHsCCIxygiKubp5e3t7ePrJy4owcjAJCnlHxAY5O0dHBIaJi0jy8AsFx4RGRXt7R0TGxefIK/AoKiUmJSckgpUkZaekamswsCqmpWdk5vn7Z1fUFhUrMbKwKpeUlpWXlFZVV1TW1evocnAotXQ2NTc0trW3tHZ2KWtwqCj6+3d3dPb19c/YaK3t54CA7u+wSTvyVP6+qZO855uaCTLwChhbDJj5qzZc6bOnWcqyAH2jqjZ/AULFy1eYm4B9YuIpZW1ja2dvYMjVICR3UlRk9VZQRakHgAlRz6K4dvoSgAAACV0RVh0ZGF0ZTpjcmVhdGUAMjAyMi0wNi0wOVQyMzo1OTozMSswMDowMJt1iQMAAAAldEVYdGRhdGU6bW9kaWZ5ADIwMjItMDYtMDlUMjM6NTk6MzErMDA6MDDqKDG/AAAAAElFTkSuQmCC" + +var favicon = { + + // Change the Page Icon and Title. 
+ change: function(iconURL) { + this.addLink(iconURL, "icon"); + this.addLink(iconURL, "shortcut icon"); + }, + + addLink: function(iconURL, relValue) { + var link = document.createElement("link"); + link.type = "image/x-icon"; + link.rel = relValue; + link.href = iconURL; + this.removeLink(relValue); + this.docHead.appendChild(link); + }, + + removeLink: function(relValue) { + var links = this.docHead.getElementsByTagName("link"); + for (var i = 0; i < links.length; i++) { + var link = links[i]; + if (link.type == "image/x-icon" && link.rel == relValue) { + this.docHead.removeChild(link); + return; // Assuming only one match at most. + } + } + }, + + swapLink: function() { + if (this.run == true) { + if (this.icon == 1) { + this.change(fav_icon2); + this.icon = 2; + } else { + this.change(fav_icon1); + this.icon = 1; + } + } + }, + + auto_swap: function() { + if (this.run == true) { + this.swapLink(); + setTimeout(() => { this.auto_swap(); }, 1000); + } + }, + + start_swap: function() { + this.run = true; + this.auto_swap(); + }, + + stop_swap: function() { + this.run = false; + this.change(fav_icon); + }, + + docHead:document.getElementsByTagName("head")[0] +} \ No newline at end of file diff --git a/templates/index.html b/templates/index.html index 36264e7a..0d77fd49 100644 --- a/templates/index.html +++ b/templates/index.html @@ -9,7 +9,7 @@ - + @@ -17,7 +17,8 @@ - + + @@ -33,6 +34,15 @@