diff --git a/aiserver.py b/aiserver.py
index 5a9b0b50..8cdc7b64 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -1,8 +1,8 @@
 #!/usr/bin/python3
 #==================================================================#
 # KoboldAI
-# Version: 1.18.1
-# By: KoboldAIDev and the KoboldAI Community
+# Version: 1.19.0
+# By: The KoboldAI Community
 #==================================================================#
 
 # External packages
@@ -17,6 +17,8 @@ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 from eventlet import tpool
 
 import logging
+from logger import logger, set_logger_verbosity, quiesce_logger
+
 logging.getLogger("urllib3").setLevel(logging.ERROR)
 
 from os import path, getcwd
@@ -57,7 +59,7 @@ from utils import debounce
 import utils
 import structures
 import torch
-from transformers import StoppingCriteria, GPT2TokenizerFast, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, modeling_utils
+from transformers import StoppingCriteria, GPT2Tokenizer, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, modeling_utils
 from transformers import __version__ as transformers_version
 import transformers
 try:
@@ -65,11 +67,12 @@ try:
 except:
     pass
 import transformers.generation_utils
+
 global tpu_mtj_backend
 
 if lupa.LUA_VERSION[:2] != (5, 4):
-    print(f"Please install lupa==1.10. You have lupa {lupa.__version__}.", file=sys.stderr)
+    logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.")
 
 patch_causallm_patched = False
@@ -123,16 +126,19 @@ model_menu = {
         ["Untuned OPT", "optlist", "", True],
         ["Untuned GPT-Neo/J", "gptneolist", "", True],
         ["Untuned Fairseq Dense", "fsdlist", "", True],
+        ["Untuned Bloom", "bloomlist", "", True],
         ["Untuned XGLM", "xglmlist", "", True],
         ["Untuned GPT2", "gpt2list", "", True],
         ["Online Services", "apilist", "", True],
         ["Read Only (No AI)", "ReadOnly", "", False]
         ],
     'adventurelist': [
+        ["Skein 20B", "KoboldAI/GPT-NeoX-20B-Skein", "64GB", False],
+        ["Nerys OPT 13B V2 (Hybrid)", "KoboldAI/OPT-13B-Nerys-v2", "32GB", False],
         ["Nerys FSD 13B V2 (Hybrid)", "KoboldAI/fairseq-dense-13B-Nerys-v2", "32GB", False],
         ["Nerys FSD 13B (Hybrid)", "KoboldAI/fairseq-dense-13B-Nerys", "32GB", False],
         ["Skein 6B", "KoboldAI/GPT-J-6B-Skein", "16GB", False],
-        ["OPT Nerys 6B V2", "KoboldAI/OPT-6B-nerys-v2", "16GB", False],
+        ["OPT Nerys 6B V2 (Hybrid)", "KoboldAI/OPT-6B-nerys-v2", "16GB", False],
         ["Adventure 6B", "KoboldAI/GPT-J-6B-Adventure", "16GB", False],
         ["Nerys FSD 2.7B (Hybrid)", "KoboldAI/fairseq-dense-2.7B-Nerys", "8GB", False],
         ["Adventure 2.7B", "KoboldAI/GPT-Neo-2.7B-AID", "8GB", False],
@@ -141,10 +147,11 @@ model_menu = {
         ["Return to Main Menu", "mainmenu", "", True],
         ],
     'novellist': [
+        ["Nerys OPT 13B V2 (Hybrid)", "KoboldAI/OPT-13B-Nerys-v2", "32GB", False],
         ["Nerys FSD 13B V2 (Hybrid)", "KoboldAI/fairseq-dense-13B-Nerys-v2", "32GB", False],
         ["Janeway FSD 13B", "KoboldAI/fairseq-dense-13B-Janeway", "32GB", False],
         ["Nerys FSD 13B (Hybrid)", "KoboldAI/fairseq-dense-13B-Nerys", "32GB", False],
-        ["OPT Nerys 6B V2", "KoboldAI/OPT-6B-nerys-v2", "16GB", False],
+        ["OPT Nerys 6B V2 (Hybrid)", "KoboldAI/OPT-6B-nerys-v2", "16GB", False],
         ["Janeway FSD 6.7B", "KoboldAI/fairseq-dense-6.7B-Janeway", "16GB", False],
         ["Janeway Neo 6B", "KoboldAI/GPT-J-6B-Janeway", "16GB", False],
         ["Janeway Neo 2.7B", "KoboldAI/GPT-Neo-2.7B-Janeway", "8GB", False],
@@ -155,10 +162,15 @@ model_menu = {
         ["Return to Main Menu", "mainmenu", "", True],
         ],
     'nsfwlist': [
+        ["Erebus 20B (NSFW)",
"KoboldAI/GPT-NeoX-20B-Erebus", "64GB", False], + ["Erebus 13B (NSFW)", "KoboldAI/OPT-13B-Erebus", "32GB", False], ["Shinen FSD 13B (NSFW)", "KoboldAI/fairseq-dense-13B-Shinen", "32GB", False], + ["Erebus 6.7B (NSFW)", "KoboldAI/OPT-6.7B-Erebus", "16GB", False], ["Shinen FSD 6.7B (NSFW)", "KoboldAI/fairseq-dense-6.7B-Shinen", "16GB", False], + ["Lit V2 6B (NSFW)", "hakurei/litv2-6B-rev3", "16GB", False], ["Lit 6B (NSFW)", "hakurei/lit-6B", "16GB", False], ["Shinen 6B (NSFW)", "KoboldAI/GPT-J-6B-Shinen", "16GB", False], + ["Erebus 2.7B (NSFW)", "KoboldAI/OPT-2.7B-Erebus", "8GB", False], ["Horni 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Horni", "8GB", False], ["Shinen 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Shinen", "8GB", False], ["Return to Main Menu", "mainmenu", "", True], @@ -170,6 +182,7 @@ model_menu = { ["Return to Main Menu", "mainmenu", "", True], ], 'gptneolist': [ + ["GPT-NeoX 20B", "EleutherAI/gpt-neox-20b", "64GB", False], ["GPT-J 6B", "EleutherAI/gpt-j-6B", "16GB", False], ["GPT-Neo 2.7B", "EleutherAI/gpt-neo-2.7B", "8GB", False], ["GPT-Neo 1.3B", "EleutherAI/gpt-neo-1.3B", "6GB", False], @@ -183,6 +196,14 @@ model_menu = { ["GPT-2", "gpt2", "2GB", False], ["Return to Main Menu", "mainmenu", "", True], ], + 'bloomlist': [ + ["Bloom 176B", "bigscience/bloom", "", False], + ["Bloom 7.1B", "bigscience/bloom-7b1", "", False], + ["Bloom 3B", "bigscience/bloom-3b", "", False], + ["Bloom 1.7B", "bigscience/bloom-1b7", "", False], + ["Bloom 560M", "bigscience/bloom-560m", "", False], + ["Return to Main Menu", "mainmenu", "", True], + ], 'optlist': [ ["OPT 66B", "facebook/opt-66b", "128GB", False], ["OPT 30B", "facebook/opt-30b", "64GB", False], @@ -217,6 +238,7 @@ model_menu = { ["InferKit API (requires API key)", "InferKit", "", False], # ["KoboldAI Server API (Old Google Colab)", "Colab", "", False], ["KoboldAI API", "API", "", False], + ["KoboldAI Horde", "CLUSTER", "", False], ["Return to Main Menu", "mainmenu", "", True], ] } @@ -238,7 +260,8 @@ class vars: lastact = "" # The last action received from the user submission = "" # Same as above, but after applying input formatting lastctx = "" # The last context submitted to the generator - model = "" # Model ID string chosen at startup + model = "ReadOnly" # Model ID string chosen at startup + online_model = "" # Used when Model ID is an online service, and there is a secondary option for the actual model name model_selected = "" #selected model in UI model_type = "" # Model Type (Automatically taken from the model config) noai = False # Runs the script without starting up the transformers pipeline @@ -307,7 +330,7 @@ class vars: badwordsids_opt = [[44717], [46613], [48513], [49923], [50185], [48755], [8488], [43303], [49659], [48601], [49817], [45405], [48742], [49925], [47720], [11227], [48937], [48784], [50017], [42248], [49310], [48082], [49895], [50025], [49092], [49007], [8061], [44226], [0], [742], [28578], [15698], [49784], [46679], [39365], [49281], [49609], [48081], [48906], [46161], [48554], [49670], [48677], [49721], [49632], [48610], [48462], [47457], [10975], [46077], [28696], [48709], [43839], [49798], [49154], [48203], [49625], [48395], [50155], [47161], [49095], [48833], [49420], [49666], [48443], [22176], [49242], [48651], [49138], [49750], [40389], [48021], [21838], [49070], [45333], [40862], [1], [49915], [33525], [49858], [50254], [44403], [48992], [48872], [46117], [49853], [47567], [50206], [41552], [50068], [48999], [49703], [49940], [49329], [47620], [49868], [49962], [2], [44082], [50236], [31274], [50260], 
[47052], [42645], [49177], [17523], [48691], [49900], [49069], [49358], [48794], [47529], [46479], [48457], [646], [49910], [48077], [48935], [46386], [48902], [49151], [48759], [49803], [45587], [48392], [47789], [48654], [49836], [49230], [48188], [50264], [46844], [44690], [48505], [50161], [27779], [49995], [41833], [50154], [49097], [48520], [50018], [8174], [50084], [49366], [49526], [50193], [7479], [49982], [3]] fp32_model = False # Whether or not the most recently loaded HF model was in fp32 format deletewi = None # Temporary storage for UID to delete - wirmvwhtsp = False # Whether to remove leading whitespace from WI entries + wirmvwhtsp = True # Whether to remove leading whitespace from WI entries widepth = 3 # How many historical actions to scan for WI hits mode = "play" # Whether the interface is in play, memory, or edit mode editln = 0 # Which line was last selected in Edit Mode @@ -318,6 +341,7 @@ class vars: colaburl = "" # Ngrok url for Google Colab mode apikey = "" # API key to use for InferKit API calls oaiapikey = "" # API key to use for OpenAI API calls + cluster_requested_models = [] # The models which we allow to generate during cluster mode savedir = getcwd()+"\\stories" hascuda = False # Whether torch has detected CUDA on the system usegpu = False # Whether to launch pipeline with GPU support @@ -359,7 +383,6 @@ class vars: actionmode = 1 dynamicscan = False host = False - flaskwebgui = False nopromptgen = False rngpersist = False nogenmod = False @@ -378,6 +401,8 @@ class vars: output_streaming = True token_stream_queue = TokenStreamQueue() # Queue for the token streaming show_probs = False # Whether or not to show token probabilities + show_budget = False # Whether or not to show token probabilities + configname = None utils.vars = vars @@ -386,7 +411,8 @@ class Send_to_socketio(object): print(bar, end="") time.sleep(0.01) try: - emit('from_server', {'cmd': 'model_load_status', 'data': bar.replace(" ", " ")}, broadcast=True) + gui_msg = bar.replace(f"{colors.PURPLE}INIT{colors.END} | ","").replace(" ", " ") + emit('from_server', {'cmd': 'model_load_status', 'data': gui_msg}, broadcast=True) except: pass @@ -395,8 +421,6 @@ import logging log = logging.getLogger('werkzeug') log.setLevel(logging.ERROR) -# Start flask & SocketIO -print("{0}Initializing Flask... {1}".format(colors.PURPLE, colors.END), end="") from flask import Flask, render_template, Response, request, copy_current_request_context, send_from_directory, session, jsonify, abort, redirect from flask_socketio import SocketIO from flask_socketio import emit as _emit @@ -407,9 +431,7 @@ app = Flask(__name__, root_path=os.getcwd()) app.secret_key = secrets.token_hex() app.config['SESSION_TYPE'] = 'filesystem' app.config['TEMPLATES_AUTO_RELOAD'] = True -Session(app) socketio = SocketIO(app, async_method="eventlet") -print("{0}OK!{1}".format(colors.GREEN, colors.END)) old_socketio_on = socketio.on def new_socketio_on(*a, **k): @@ -614,6 +636,18 @@ api_v1 = KoboldAPISpec( tags=tags, ) +# Returns the expected config filename for the current setup. +# If the model_name is specified, it returns what the settings file would be for that model +def get_config_filename(model_name = None): + if model_name: + return(f"settings/{model_name.replace('/', '_')}.settings") + elif args.configname: + return(f"settings/{args.configname.replace('/', '_')}.settings") + elif vars.configname != '': + return(f"settings/{vars.configname.replace('/', '_')}.settings") + else: + logger.warning(f"Empty configfile name sent back. 
Defaulting to ReadOnly") + return(f"settings/ReadOnly.settings") #==================================================================# # Function to get model selection at startup #==================================================================# @@ -697,15 +731,7 @@ def getModelSelection(modellist): getModelSelection(mainmenu) def check_if_dir_is_model(path): - if os.path.exists(path): - try: - from transformers import AutoConfig - model_config = AutoConfig.from_pretrained(path) - except: - return False - return True - else: - return False + return os.path.exists(os.path.join(path, 'config.json')) #==================================================================# # Return all keys in tokenizer dictionary containing char @@ -721,9 +747,8 @@ def check_if_dir_is_model(path): # Return Model Name #==================================================================# def getmodelname(): - if(args.configname): - modelname = args.configname - return modelname + if(vars.online_model != ''): + return(f"{vars.model}/{vars.online_model}") if(vars.model in ("NeoCustom", "GPT2Custom", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): modelname = os.path.basename(os.path.normpath(vars.custmodpth)) return modelname @@ -781,14 +806,14 @@ def device_config(config): breakmodel.disk_blocks = args.breakmodel_disklayers n_layers -= args.breakmodel_disklayers except: - print("WARNING: --breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0.", file=sys.stderr) + logger.warning("--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0.") breakmodel.gpu_blocks = [n_layers] n_layers = 0 elif(args.breakmodel_layers is not None): breakmodel.gpu_blocks = [n_layers - max(0, min(n_layers, args.breakmodel_layers))] n_layers -= sum(breakmodel.gpu_blocks) elif(args.model is not None): - print("Breakmodel not specified, assuming GPU 0") + logger.info("Breakmodel not specified, assuming GPU 0") breakmodel.gpu_blocks = [n_layers] n_layers = 0 else: @@ -847,7 +872,7 @@ def device_config(config): else: print(f"{colors.RED}Please enter an integer between -1 and {n_layers}.{colors.END}") - print(colors.PURPLE + "\nFinal device configuration:") + logger.init_ok("Final device configuration:", status="Info") device_list(n_layers, primary=breakmodel.primary_device) # If all layers are on the same device, use the old GPU generation mode @@ -860,7 +885,7 @@ def device_config(config): return if(not breakmodel.gpu_blocks): - print("Nothing assigned to a GPU, reverting to CPU only mode") + logger.warning("Nothing assigned to a GPU, reverting to CPU only mode") import breakmodel breakmodel.primary_device = "cpu" vars.breakmodel = False @@ -1040,6 +1065,7 @@ def savesettings(): js["welcome"] = vars.welcome js["output_streaming"] = vars.output_streaming js["show_probs"] = vars.show_probs + js["show_budget"] = vars.show_budget if(vars.seed_specified): js["seed"] = vars.seed @@ -1057,7 +1083,7 @@ def savesettings(): # Write it if not os.path.exists('settings'): os.mkdir('settings') - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "w") + file = open(get_config_filename(), "w") try: file.write(json.dumps(js, indent=3)) finally: @@ -1068,7 +1094,7 @@ def savesettings(): #==================================================================# @debounce(2) def settingschanged(): - print("{0}Saving 
settings!{1}".format(colors.GREEN, colors.END)) + logger.info("Saving settings.") savesettings() #==================================================================# @@ -1083,9 +1109,9 @@ def loadsettings(): processsettings(js) file.close() - if(path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): + if(path.exists(get_config_filename())): # Read file contents into JSON object - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") + file = open(get_config_filename(), "r") js = json.load(file) processsettings(js) @@ -1094,38 +1120,41 @@ def loadsettings(): def processsettings(js): # Copy file contents to vars if("apikey" in js): - vars.apikey = js["apikey"] + # If the model is the HORDE, then previously saved API key in settings + # Will always override a new key set. + if vars.model != "CLUSTER" or vars.apikey == '': + vars.apikey = js["apikey"] if("andepth" in js): - vars.andepth = js["andepth"] + vars.andepth = js["andepth"] if("sampler_order" in js): sampler_order = vars.sampler_order if(len(sampler_order) < 7): sampler_order = [6] + sampler_order vars.sampler_order = sampler_order if("temp" in js): - vars.temp = js["temp"] + vars.temp = js["temp"] if("top_p" in js): - vars.top_p = js["top_p"] + vars.top_p = js["top_p"] if("top_k" in js): - vars.top_k = js["top_k"] + vars.top_k = js["top_k"] if("tfs" in js): - vars.tfs = js["tfs"] + vars.tfs = js["tfs"] if("typical" in js): - vars.typical = js["typical"] + vars.typical = js["typical"] if("top_a" in js): - vars.top_a = js["top_a"] + vars.top_a = js["top_a"] if("rep_pen" in js): - vars.rep_pen = js["rep_pen"] + vars.rep_pen = js["rep_pen"] if("rep_pen_slope" in js): vars.rep_pen_slope = js["rep_pen_slope"] if("rep_pen_range" in js): vars.rep_pen_range = js["rep_pen_range"] if("genamt" in js): - vars.genamt = js["genamt"] + vars.genamt = js["genamt"] if("max_length" in js): vars.max_length = js["max_length"] if("ikgen" in js): - vars.ikgen = js["ikgen"] + vars.ikgen = js["ikgen"] if("formatoptns" in js): vars.formatoptns = js["formatoptns"] if("numseqs" in js): @@ -1160,6 +1189,8 @@ def processsettings(js): vars.output_streaming = js["output_streaming"] if("show_probs" in js): vars.show_probs = js["show_probs"] + if("show_budget" in js): + vars.show_budget = js["show_budget"] if("seed" in js): vars.seed = js["seed"] @@ -1288,6 +1319,8 @@ def general_startup(override_args=None): parser.add_argument("--aria2_port", type=int, help="Specify the port on which aria2's RPC interface will be open if aria2 is installed (defaults to 6799)") parser.add_argument("--model", help="Specify the Model Type to skip the Menu") parser.add_argument("--path", help="Specify the Path for local models (For model NeoCustom or GPT2Custom)") + parser.add_argument("--apikey", help="Specify the API key to use for online services") + parser.add_argument("--req_model", type=str, action='append', required=False, help="Which models which we allow to generate for us during cluster mode. 
Can be specified multiple times.") parser.add_argument("--revision", help="Specify the model revision for huggingface models (can be a git branch/tag name or a git commit hash)") parser.add_argument("--cpu", action='store_true', help="By default unattended launches are on the GPU use this option to force CPU usage.") parser.add_argument("--breakmodel", action='store_true', help=argparse.SUPPRESS) @@ -1306,6 +1339,9 @@ def general_startup(override_args=None): parser.add_argument("--savemodel", action='store_true', help="Saves the model to the models folder even if --colab is used (Allows you to save models to Google Drive)") parser.add_argument("--customsettings", help="Preloads arguements from json file. You only need to provide the location of the json file. Use customsettings.json template file. It can be renamed if you wish so that you can store multiple configurations. Leave any settings you want as default as null. Any values you wish to set need to be in double quotation marks") parser.add_argument("--no_ui", action='store_true', default=False, help="Disables the GUI and Socket.IO server while leaving the API server running.") + parser.add_argument('-v', '--verbosity', action='count', default=0, help="The default logging level is ERROR or higher. This value increases the amount of logging seen in your screen") + parser.add_argument('-q', '--quiesce', action='count', default=0, help="The default logging level is ERROR or higher. This value decreases the amount of logging seen in your screen") + #args: argparse.Namespace = None if "pytest" in sys.modules and override_args is None: args = parser.parse_args([]) @@ -1321,6 +1357,8 @@ def general_startup(override_args=None): utils.args = args + set_logger_verbosity(args.verbosity) + quiesce_logger(args.quiesce) if args.customsettings: f = open (args.customsettings) importedsettings = json.load(f) @@ -1338,6 +1376,11 @@ def general_startup(override_args=None): vars.model = args.model; vars.revision = args.revision + if args.apikey: + vars.apikey = args.apikey + if args.req_model: + vars.cluster_requested_models = args.req_model + if args.colab: args.remote = True; args.override_rename = True; @@ -1383,9 +1426,10 @@ def general_startup(override_args=None): vars.model = "NeoCustom" vars.custmodpth = modpath elif args.model: - print("Welcome to KoboldAI!\nYou have selected the following Model:", vars.model) + logger.message(f"Welcome to KoboldAI!") + logger.message(f"You have selected the following Model: {vars.model}") if args.path: - print("You have selected the following path for your Model :", args.path) + logger.message(f"You have selected the following path for your Model: {args.path}") vars.custmodpth = args.path; vars.colaburl = args.path + "/request"; # Lets just use the same parameter to keep it simple #==================================================================# @@ -1425,13 +1469,38 @@ def get_model_info(model, directory=""): key_value = "" break_values = [] url = False + default_url = None + models_on_url = False + multi_online_models = False gpu_count = torch.cuda.device_count() gpu_names = [] + send_horde_models = False for i in range(gpu_count): gpu_names.append(torch.cuda.get_device_name(i)) - if model in [x[1] for x in model_menu['apilist']]: - if path.exists("settings/{}.settings".format(model)): - with open("settings/{}.settings".format(model), "r") as file: + if model in ['Colab', 'API']: + url = True + elif model == 'CLUSTER': + models_on_url = True + url = True + key = True + default_url = 'https://koboldai.net' + 
multi_online_models = True + if path.exists(get_config_filename(model)): + with open(get_config_filename(model), "r") as file: + # Check if API key exists + js = json.load(file) + if("apikey" in js and js["apikey"] != ""): + # API key exists, grab it and close the file + key_value = js["apikey"] + elif 'oaiapikey' in js and js['oaiapikey'] != "": + key_value = js["oaiapikey"] + if 'url' in js and js['url'] != "": + url = js['url'] + if key_value != "": + send_horde_models = True + elif model in [x[1] for x in model_menu['apilist']]: + if path.exists(get_config_filename(model)): + with open(get_config_filename(model), "r") as file: # Check if API key exists js = json.load(file) if("apikey" in js and js["apikey"] != ""): @@ -1442,8 +1511,6 @@ def get_model_info(model, directory=""): key = True elif model == 'ReadOnly': pass - elif model == 'Colab': - url = True elif not utils.HAS_ACCELERATE and not torch.cuda.is_available(): pass elif args.cpu: @@ -1452,6 +1519,7 @@ def get_model_info(model, directory=""): layer_count = get_layer_count(model, directory=directory) if layer_count is None: breakmodel = False + gpu = True else: breakmodel = True if model in ["NeoCustom", "GPT2Custom"]: @@ -1475,14 +1543,16 @@ def get_model_info(model, directory=""): emit('from_server', {'cmd': 'selected_model_info', 'key_value': key_value, 'key':key, 'gpu':gpu, 'layer_count':layer_count, 'breakmodel':breakmodel, 'disk_break_value': disk_blocks, 'accelerate': utils.HAS_ACCELERATE, - 'break_values': break_values, 'gpu_count': gpu_count, - 'url': url, 'gpu_names': gpu_names}, broadcast=True) - if key_value != "": + 'break_values': break_values, 'gpu_count': gpu_count, 'multi_online_models': multi_online_models, + 'url': url, 'default_url': default_url, 'gpu_names': gpu_names, 'models_on_url': models_on_url}, broadcast=True) + if send_horde_models: + get_cluster_models({'key': key_value, 'url': default_url}) + elif key_value != "" and model in [x[1] for x in model_menu['apilist']] and model != 'CLUSTER': get_oai_models(key_value) def get_layer_count(model, directory=""): - if(model not in ["InferKit", "Colab", "API", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]): + if(model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]): if(model == "GPT2Custom"): with open(os.path.join(directory, "config.json"), "r") as f: model_config = json.load(f) @@ -1499,11 +1569,16 @@ def get_layer_count(model, directory=""): model_config = AutoConfig.from_pretrained(directory, revision=vars.revision, cache_dir="cache") else: model_config = AutoConfig.from_pretrained(model, revision=vars.revision, cache_dir="cache") - return utils.num_layers(model_config) + try: + if ((utils.HAS_ACCELERATE and model_config.model_type != 'gpt2') or model_config.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not vars.nobreakmodel: + return utils.num_layers(model_config) + else: + return None + except: + return None else: return None - def get_oai_models(key): vars.oaiapikey = key if vars.model_selected == 'OAI': @@ -1514,7 +1589,7 @@ def get_oai_models(key): return # Get list of models from OAI - print("{0}Retrieving engine list...{1}".format(colors.PURPLE, colors.END), end="") + logger.init("OAI Engines", status="Retrieving") req = requests.get( url, headers = { @@ -1526,7 +1601,7 @@ def get_oai_models(key): try: engines = [[en["id"], "{} ({})".format(en['id'], "Ready" if en["ready"] == True else "Not Ready")] for en in engines] except: - print(engines) + logger.error(engines) raise 
online_model = "" @@ -1537,26 +1612,86 @@ def get_oai_models(key): # If the client settings file doesn't exist, create it # Write API key to file os.makedirs('settings', exist_ok=True) - if path.exists("settings/{}.settings".format(vars.model_selected)): - with open("settings/{}.settings".format(vars.model_selected), "r") as file: + if path.exists(get_config_filename(vars.model_selected)): + with open(get_config_filename(vars.model_selected), "r") as file: js = json.load(file) if 'online_model' in js: online_model = js['online_model'] if "apikey" in js: if js['apikey'] != key: changed=True + else: + changed=True if changed: - with open("settings/{}.settings".format(vars.model_selected), "w") as file: + js={} + with open(get_config_filename(vars.model_selected), "w") as file: js["apikey"] = key file.write(json.dumps(js, indent=3)) + logger.init_ok("OAI Engines", status="OK") emit('from_server', {'cmd': 'oai_engines', 'data': engines, 'online_model': online_model}, broadcast=True) else: # Something went wrong, print the message and quit since we can't initialize an engine - print("{0}ERROR!{1}".format(colors.RED, colors.END)) - print(req.json()) + logger.init_err("OAI Engines", status="Failed") + logger.error(req.json()) emit('from_server', {'cmd': 'errmsg', 'data': req.json()}) +def get_cluster_models(msg): + vars.oaiapikey = msg['key'] + vars.apikey = vars.oaiapikey + url = msg['url'] + # Get list of models from public cluster + logger.init("KAI Horde Models", status="Retrieving") + try: + req = requests.get("{}/models".format(url)) + except requests.exceptions.ConnectionError: + logger.init_err("KAI Horde Models", status="Failed") + logger.error("Provided KoboldAI Horde URL unreachable") + emit('from_server', {'cmd': 'errmsg', 'data': "Provided KoboldAI Horde URL unreachable"}) + return + if(not req.ok): + # Something went wrong, print the message and quit since we can't initialize an engine + logger.init_err("KAI Horde Models", status="Failed") + logger.error(req.json()) + emit('from_server', {'cmd': 'errmsg', 'data': req.json()}) + return + + engines = req.json() + logger.debug(engines) + try: + engines = [[en, en] for en in engines] + except: + logger.error(engines) + raise + + online_model = "" + changed=False + + #Save the key + if not path.exists("settings"): + # If the client settings file doesn't exist, create it + # Write API key to file + os.makedirs('settings', exist_ok=True) + if path.exists(get_config_filename(vars.model_selected)): + with open(get_config_filename(vars.model_selected), "r") as file: + js = json.load(file) + if 'online_model' in js: + online_model = js['online_model'] + if "apikey" in js: + if js['apikey'] != vars.oaiapikey: + changed=True + else: + changed=True + if changed: + js={} + with open(get_config_filename(vars.model_selected), "w") as file: + js["apikey"] = vars.oaiapikey + js["url"] = url + file.write(json.dumps(js, indent=3)) + + logger.init_ok("KAI Horde Models", status="OK") + emit('from_server', {'cmd': 'oai_engines', 'data': engines, 'online_model': online_model}, broadcast=True) + # Function to patch transformers to use our soft prompt def patch_causallm(model): @@ -1600,11 +1735,11 @@ def patch_transformers_download(): pass def http_get( url: str, - temp_file: transformers.utils.hub.BinaryIO, + temp_file, proxies=None, resume_size=0, - headers: transformers.utils.hub.Optional[transformers.utils.hub.Dict[str, str]] = None, - file_name: transformers.utils.hub.Optional[str] = None, + headers=None, + file_name=None, ): """ Download remote file. 
Do not gobble up errors. @@ -1866,6 +2001,8 @@ def patch_transformers(): if not (vars.show_probs or vars.output_streaming): return False + if vars.chatmode: + return False tokenizer_text = utils.decodenewlines(tokenizer.decode(input_ids[0, -1])) vars.token_stream_queue.add_text(tokenizer_text) return False @@ -1956,19 +2093,23 @@ def reset_model_settings(): vars.sampler_order = [6, 0, 1, 2, 3, 4, 5] vars.newlinemode = "n" vars.revision = None + vars.lazy_load = True + -def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model=""): +def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False): global model global generator global torch global model_config - global GPT2TokenizerFast + global GPT2Tokenizer global tokenizer + if(initial_load): + use_breakmodel_args = True reset_model_settings() if not utils.HAS_ACCELERATE: disk_layers = None vars.noai = False - if not initial_load: + if not use_breakmodel_args: set_aibusy(True) if vars.model != 'ReadOnly': emit('from_server', {'cmd': 'model_load_status', 'data': "Loading {}".format(vars.model)}, broadcast=True) @@ -1976,17 +2117,22 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal time.sleep(0.1) if gpu_layers is not None: args.breakmodel_gpulayers = gpu_layers - elif initial_load: + elif use_breakmodel_args: gpu_layers = args.breakmodel_gpulayers + if breakmodel_args_default_to_cpu and gpu_layers is None: + gpu_layers = args.breakmodel_gpulayers = [] if disk_layers is not None: args.breakmodel_disklayers = int(disk_layers) - elif initial_load: + elif use_breakmodel_args: disk_layers = args.breakmodel_disklayers + if breakmodel_args_default_to_cpu and disk_layers is None: + disk_layers = args.breakmodel_disklayers = 0 #We need to wipe out the existing model and refresh the cuda cache model = None generator = None model_config = None + vars.online_model = '' with torch.no_grad(): with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated") @@ -2005,11 +2151,26 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal #Reload our badwords vars.badwordsids = vars.badwordsids_default + if online_model == "": + vars.configname = getmodelname() #Let's set the GooseAI or OpenAI server URLs if that's applicable - if online_model != "": - if path.exists("settings/{}.settings".format(vars.model)): + else: + vars.online_model = online_model + # Swap OAI Server if GooseAI was selected + if(vars.model == "GooseAI"): + vars.oaiengines = "https://api.goose.ai/v1/engines" + vars.model = "OAI" + vars.configname = f"GooseAI_{online_model.replace('/', '_')}" + elif(vars.model == "CLUSTER") and type(online_model) is list: + if len(online_model) != 1: + vars.configname = vars.model + else: + vars.configname = f"{vars.model}_{online_model[0].replace('/', '_')}" + else: + vars.configname = f"{vars.model}_{online_model.replace('/', '_')}" + if path.exists(get_config_filename()): changed=False - with open("settings/{}.settings".format(vars.model), "r") as file: + with open(get_config_filename(), "r") as file: # Check if API key exists js = json.load(file) if 'online_model' in js: @@ -2020,20 +2181,21 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal changed=True js['online_model'] = online_model if changed: - with open("settings/{}.settings".format(vars.model), "w") as file: + 
with open(get_config_filename(), "w") as file: file.write(json.dumps(js, indent=3)) + # Swap OAI Server if GooseAI was selected if(vars.model == "GooseAI"): vars.oaiengines = "https://api.goose.ai/v1/engines" vars.model = "OAI" args.configname = "GooseAI" + "/" + online_model - else: + elif vars.model != "CLUSTER": args.configname = vars.model + "/" + online_model vars.oaiurl = vars.oaiengines + "/{0}/completions".format(online_model) # If transformers model was selected & GPU available, ask to use CPU or GPU - if(vars.model not in ["InferKit", "Colab", "API", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): + if(vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): vars.allowsp = True # Test for GPU support @@ -2069,28 +2231,28 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal elif(vars.model_type == "not_found" and vars.model == "GPT2Custom"): vars.model_type = "gpt2" elif(vars.model_type == "not_found"): - print("WARNING: No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)") + logger.warning("No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)") vars.model_type = "gpt_neo" - if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "API", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): + if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): loadmodelsettings() loadsettings() - print("{0}Looking for GPU support...{1}".format(colors.PURPLE, colors.END), end="") - vars.hascuda = torch.cuda.is_available() - vars.bmsupported = (utils.HAS_ACCELERATE or vars.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not vars.nobreakmodel + logger.init("GPU support", status="Searching") + vars.hascuda = torch.cuda.is_available() and not args.cpu + vars.bmsupported = ((utils.HAS_ACCELERATE and vars.model_type != 'gpt2') or vars.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not vars.nobreakmodel if(args.breakmodel is not None and args.breakmodel): - print("WARNING: --breakmodel is no longer supported. Breakmodel mode is now automatically enabled when --breakmodel_gpulayers is used (see --help for details).", file=sys.stderr) + logger.warning("--breakmodel is no longer supported. Breakmodel mode is now automatically enabled when --breakmodel_gpulayers is used (see --help for details).") if(args.breakmodel_layers is not None): - print("WARNING: --breakmodel_layers is deprecated. Use --breakmodel_gpulayers instead (see --help for details).", file=sys.stderr) + logger.warning("--breakmodel_layers is deprecated. 
Use --breakmodel_gpulayers instead (see --help for details).") if(args.model and vars.bmsupported and not args.breakmodel_gpulayers and not args.breakmodel_layers and (not utils.HAS_ACCELERATE or not args.breakmodel_disklayers)): - print("WARNING: Model launched without the --breakmodel_gpulayers argument, defaulting to GPU only mode.", file=sys.stderr) + logger.warning("Model launched without the --breakmodel_gpulayers argument, defaulting to GPU only mode.") vars.bmsupported = False if(not vars.bmsupported and (args.breakmodel_gpulayers is not None or args.breakmodel_layers is not None or args.breakmodel_disklayers is not None)): - print("WARNING: This model does not support hybrid generation. --breakmodel_gpulayers will be ignored.", file=sys.stderr) + logger.warning("This model does not support hybrid generation. --breakmodel_gpulayers will be ignored.") if(vars.hascuda): - print("{0}FOUND!{1}".format(colors.GREEN, colors.END)) + logger.init_ok("GPU support", status="Found") else: - print("{0}NOT FOUND!{1}".format(colors.YELLOW, colors.END)) + logger.init_warn("GPU support", status="Not Found") if args.cpu: vars.usegpu = False @@ -2103,7 +2265,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal vars.breakmodel = True else: vars.breakmodel = False - vars.usegpu = True + vars.usegpu = use_gpu # Ask for API key if InferKit was selected @@ -2114,20 +2276,20 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(vars.model == "GooseAI"): vars.oaiengines = "https://api.goose.ai/v1/engines" vars.model = "OAI" - args.configname = "GooseAI" + vars.configname = "GooseAI" # Ask for API key if OpenAI was selected if(vars.model == "OAI"): - if not args.configname: - args.configname = "OAI" + if not vars.configname: + vars.configname = "OAI" if(vars.model == "ReadOnly"): vars.noai = True # Start transformers and create pipeline - if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "API", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): + if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): if(not vars.noai): - print("{0}Initializing transformers, please wait...{1}".format(colors.PURPLE, colors.END)) + logger.init("Transformers", status='Starting') for m in ("GPTJModel", "XGLMModel"): try: globals()[m] = getattr(__import__("transformers"), m) @@ -2192,8 +2354,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs)) else: num_tensors = len(device_map) - print(flush=True) - utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=Send_to_socketio()) + utils.bar = tqdm(total=num_tensors, desc=f"{colors.PURPLE}INIT{colors.END} | Loading model tensors", file=Send_to_socketio()) with zipfile.ZipFile(f, "r") as z: try: @@ -2259,23 +2420,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal def get_hidden_size_from_model(model): - try: - return int(model.model.decoder.project_in.in_features) - except: - try: - return int(model.model.decoder.embed_tokens.out_features) - except: - try: - return int(model.transformer.hidden_size) - except: - try: - return int(model.transformer.embed_dim) - except: - return 
int(model.lm_head.in_features) + return model.get_input_embeddings().embedding_dim def maybe_low_cpu_mem_usage() -> Dict[str, Any]: if(packaging.version.parse(transformers_version) < packaging.version.parse("4.11.0")): - print(f"\nWARNING: Please upgrade to transformers 4.11.0 for lower RAM usage. You have transformers {transformers_version}.", file=sys.stderr) + logger.warning(f"Please upgrade to transformers 4.11.0 for lower RAM usage. You have transformers {transformers_version}.") return {} return {"low_cpu_mem_usage": True} @@ -2290,18 +2439,33 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal yield False # If custom GPT2 model was chosen - if(vars.model == "GPT2Custom"): + if(vars.model_type == "gpt2"): vars.lazy_load = False - model_config = open(vars.custmodpth + "/config.json", "r") - js = json.load(model_config) + if os.path.exists(vars.custmodpth): + model_config = open(vars.custmodpth + "/config.json", "r") + elif os.path.exists(os.path.join("models/", vars.custmodpth)): + config_path = os.path.join("models/", vars.custmodpth) + config_path = os.path.join(config_path, "config.json").replace("\\", "//") + model_config = open(config_path, "r") + #js = json.load(model_config) with(maybe_use_float16()): try: - model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + if os.path.exists(vars.custmodpth): + model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + elif os.path.exists(os.path.join("models/", vars.custmodpth)): + model = GPT2LMHeadModel.from_pretrained(os.path.join("models/", vars.custmodpth), revision=vars.revision, cache_dir="cache") + tokenizer = GPT2Tokenizer.from_pretrained(os.path.join("models/", vars.custmodpth), revision=vars.revision, cache_dir="cache") + else: + model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") except Exception as e: if("out of memory" in traceback.format_exc().lower()): raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") raise e - tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + model.save_pretrained("models/{}".format(vars.model.replace('/', '_')), max_shard_size="500MiB") + tokenizer.save_pretrained("models/{}".format(vars.model.replace('/', '_'))) vars.modeldim = get_hidden_size_from_model(model) # Is CUDA available? 
If so, use GPU, otherwise fall back to CPU if(vars.hascuda and vars.usegpu): @@ -2324,7 +2488,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors if (utils.HAS_ACCELERATE or vars.lazy_load and vars.hascuda and vars.breakmodel) and not vars.nobreakmodel: - print(1) device_config(model_config) # Download model from Huggingface if it does not exist, otherwise load locally @@ -2333,7 +2496,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if os.path.isdir(vars.model.replace('/', '_')): import shutil shutil.move(vars.model.replace('/', '_'), "models/{}".format(vars.model.replace('/', '_'))) - print("\n", flush=True) if(vars.lazy_load): # If we're using lazy loader, we need to figure out what the model's hidden layers are called with torch_lazy_loader.use_lazy_torch_load(dematerialized_modules=True, use_accelerate_init_empty_weights=True): try: @@ -2347,17 +2509,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if(vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time lowmem = {} if(os.path.isdir(vars.custmodpth)): - try: - tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") - except Exception as e: - pass try: tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", use_fast=False) except Exception as e: try: - tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") except Exception as e: - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") + try: + tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache") + except Exception as e: + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") try: model = AutoModelForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem) except Exception as e: @@ -2365,17 +2526,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.") model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, revision=vars.revision, cache_dir="cache", **lowmem) elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))): - try: - tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache") - except Exception as e: - pass try: tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", use_fast=False) except Exception as e: try: - tokenizer = GPT2TokenizerFast.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache") + tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache") except Exception as e: - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") + try: + tokenizer = GPT2Tokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), 
revision=vars.revision, cache_dir="cache") + except Exception as e: + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") try: model = AutoModelForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), revision=vars.revision, cache_dir="cache", **lowmem) except Exception as e: @@ -2396,17 +2556,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal return old_rebuild_tensor(storage, storage_offset, shape, stride) torch._utils._rebuild_tensor = new_rebuild_tensor - try: - tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache") - except Exception as e: - pass try: tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", use_fast=False) except Exception as e: try: - tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache") + tokenizer = AutoTokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache") except Exception as e: - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") + try: + tokenizer = GPT2Tokenizer.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache") + except Exception as e: + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") try: model = AutoModelForCausalLM.from_pretrained(vars.model, revision=vars.revision, cache_dir="cache", **lowmem) except Exception as e: @@ -2426,20 +2585,22 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal import transformers.configuration_utils import transformers.modeling_utils import transformers.file_utils + import huggingface_hub + legacy = packaging.version.parse(transformers_version) < packaging.version.parse("4.22.0.dev0") # Save the config.json - shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, transformers.configuration_utils.CONFIG_NAME, revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), transformers.configuration_utils.CONFIG_NAME)) + shutil.move(os.path.realpath(huggingface_hub.hf_hub_download(vars.model, transformers.configuration_utils.CONFIG_NAME, revision=vars.revision, cache_dir="cache", local_files_only=True, legacy_cache_layout=legacy)), os.path.join("models/{}".format(vars.model.replace('/', '_')), transformers.configuration_utils.CONFIG_NAME)) if(utils.num_shards is None): # Save the pytorch_model.bin of an unsharded model - shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, transformers.modeling_utils.WEIGHTS_NAME, revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), transformers.modeling_utils.WEIGHTS_NAME)) + shutil.move(os.path.realpath(huggingface_hub.hf_hub_download(vars.model, transformers.modeling_utils.WEIGHTS_NAME, revision=vars.revision, cache_dir="cache", local_files_only=True, legacy_cache_layout=legacy)), os.path.join("models/{}".format(vars.model.replace('/', '_')), transformers.modeling_utils.WEIGHTS_NAME)) else: with open(utils.from_pretrained_index_filename) as f: map_data = json.load(f) filenames = set(map_data["weight_map"].values()) # Save the pytorch_model.bin.index.json of a sharded model - shutil.move(utils.from_pretrained_index_filename, 
os.path.join("models/{}".format(vars.model.replace('/', '_')), transformers.modeling_utils.WEIGHTS_INDEX_NAME)) + shutil.move(os.path.realpath(utils.from_pretrained_index_filename), os.path.join("models/{}".format(vars.model.replace('/', '_')), transformers.modeling_utils.WEIGHTS_INDEX_NAME)) # Then save the pytorch_model-#####-of-#####.bin files for filename in filenames: - shutil.move(transformers.file_utils.get_from_cache(transformers.file_utils.hf_bucket_url(vars.model, filename, revision=vars.revision), cache_dir="cache", local_files_only=True), os.path.join("models/{}".format(vars.model.replace('/', '_')), filename)) + shutil.move(os.path.realpath(huggingface_hub.hf_hub_download(vars.model, filename, revision=vars.revision, cache_dir="cache", local_files_only=True, legacy_cache_layout=legacy)), os.path.join("models/{}".format(vars.model.replace('/', '_')), filename)) shutil.rmtree("cache/") if(vars.badwordsids is vars.badwordsids_default and vars.model_type not in ("gpt2", "gpt_neo", "gptj")): @@ -2455,7 +2616,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal elif(vars.breakmodel): # Use both RAM and VRAM (breakmodel) vars.modeldim = get_hidden_size_from_model(model) if(not vars.lazy_load): - print(2) device_config(model.config) move_model_to_devices(model) elif(utils.HAS_ACCELERATE and __import__("breakmodel").disk_blocks > 0): @@ -2482,11 +2642,11 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal #for key in vars.badwords: # vars.badwordsids.append([vocab[key]]) - print("{0}OK! {1} pipeline created!{2}".format(colors.GREEN, vars.model, colors.END)) + logger.info(f"Pipeline created: {vars.model}") else: - from transformers import GPT2TokenizerFast - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") + from transformers import GPT2Tokenizer + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") else: from transformers import PreTrainedModel from transformers import modeling_utils @@ -2583,13 +2743,13 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal } # If we're running Colab or OAI, we still need a tokenizer. 
- if(vars.model in ("Colab", "API")): - from transformers import GPT2TokenizerFast - tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-2.7B", revision=vars.revision, cache_dir="cache") + if(vars.model in ("Colab", "API", "CLUSTER")): + from transformers import GPT2Tokenizer + tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B", revision=vars.revision, cache_dir="cache") loadsettings() elif(vars.model == "OAI"): - from transformers import GPT2TokenizerFast - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") + from transformers import GPT2Tokenizer + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") loadsettings() # Load the TPU backend if requested elif(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): @@ -2642,10 +2802,8 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal def index(): if args.no_ui: return redirect('/api/latest') - if 'new_ui' in request.args: - return render_template('index_new.html', hide_ai_menu=args.noaimenu) else: - return render_template('index.html', hide_ai_menu=args.noaimenu, flaskwebgui=vars.flaskwebgui) + return render_template('index.html', hide_ai_menu=args.noaimenu) @app.route('/api', strict_slashes=False) def api(): return redirect('/api/latest') @@ -2708,8 +2866,8 @@ def lua_startup(): global _bridged global F global bridged - if(path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") + if(path.exists(get_config_filename())): + file = open(get_config_filename(), "r") js = json.load(file) if("userscripts" in js): vars.userscripts = [] @@ -2730,7 +2888,7 @@ def lua_startup(): #==================================================================# print("", end="", flush=True) - print(colors.PURPLE + "Initializing Lua Bridge... " + colors.END, end="", flush=True) + logger.init("LUA bridge", status="Starting") # Set up Lua state vars.lua_state = lupa.LuaRuntime(unpack_returned_tuples=True) @@ -2753,10 +2911,10 @@ def lua_startup(): except lupa.LuaError as e: print(colors.RED + "ERROR!" + colors.END) vars.lua_koboldbridge.obliterate_multiverse() - print("{0}{1}{2}".format(colors.RED, "***LUA ERROR***: ", colors.END), end="", file=sys.stderr) - print("{0}{1}{2}".format(colors.RED, str(e).replace("\033", ""), colors.END), file=sys.stderr) + logger.debug('LUA ERROR: ' + str(e).replace("\033", "")) + logger.warning("Lua engine stopped; please open 'Userscripts' and press Load to reinitialize scripts.") exit(1) - print(colors.GREEN + "OK!" 
+ colors.END) + logger.init_ok("LUA bridge", status="OK") def lua_log_format_name(name): @@ -2780,7 +2938,7 @@ def load_callback(filename, modulename): # Load all Lua scripts #==================================================================# def load_lua_scripts(): - print(colors.GREEN + "Loading Core Script" + colors.END) + logger.init("LUA Scripts", status="Starting") filenames = [] modulenames = [] @@ -2812,11 +2970,11 @@ def load_lua_scripts(): if(vars.serverstarted): emit('from_server', {'cmd': 'errmsg', 'data': 'Lua script error; please check console.'}, broadcast=True) sendUSStatItems() - print("{0}{1}{2}".format(colors.RED, "***LUA ERROR***: ", colors.END), end="", file=sys.stderr) - print("{0}{1}{2}".format(colors.RED, str(e).replace("\033", ""), colors.END), file=sys.stderr) - print("{0}{1}{2}".format(colors.YELLOW, "Lua engine stopped; please open 'Userscripts' and press Load to reinitialize scripts.", colors.END), file=sys.stderr) + logger.debug('LUA ERROR: ' + str(e).replace("\033", "")) + logger.warning("Lua engine stopped; please open 'Userscripts' and press Load to reinitialize scripts.") if(vars.serverstarted): set_aibusy(0) + logger.init_ok("LUA Scripts", status="OK") #==================================================================# # Print message that originates from the userscript with the given name @@ -2846,9 +3004,9 @@ def lua_decode(tokens): tokens = list(tokens.values()) assert type(tokens) is list if("tokenizer" not in globals()): - from transformers import GPT2TokenizerFast + from transformers import GPT2Tokenizer global tokenizer - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") return utils.decodenewlines(tokenizer.decode(tokens)) #==================================================================# @@ -2858,9 +3016,9 @@ def lua_decode(tokens): def lua_encode(string): assert type(string) is str if("tokenizer" not in globals()): - from transformers import GPT2TokenizerFast + from transformers import GPT2Tokenizer global tokenizer - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") return tokenizer.encode(utils.encodenewlines(string), max_length=int(4e9), truncation=True) #==================================================================# @@ -3229,7 +3387,7 @@ def lua_set_chunk(k, v): def lua_get_modeltype(): if(vars.noai): return "readonly" - if(vars.model in ("Colab", "API", "OAI", "InferKit")): + if(vars.model in ("Colab", "API", "CLUSTER", "OAI", "InferKit")): return "api" if(not vars.use_colab_tpu and vars.model not in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX") and (vars.model in ("GPT2Custom", "NeoCustom") or vars.model_type in ("gpt2", "gpt_neo", "gptj"))): hidden_size = get_hidden_size_from_model(model) @@ -3258,7 +3416,7 @@ def lua_get_modeltype(): def lua_get_modelbackend(): if(vars.noai): return "readonly" - if(vars.model in ("Colab", "API", "OAI", "InferKit")): + if(vars.model in ("Colab", "API", "CLUSTER", "OAI", "InferKit")): return "api" if(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): return "mtj" @@ -3309,9 +3467,8 @@ def execute_inmod(): vars.lua_running = False emit('from_server', {'cmd': 'errmsg', 'data': 'Lua script error; please check console.'}, broadcast=True) sendUSStatItems() - 
print("{0}{1}{2}".format(colors.RED, "***LUA ERROR***: ", colors.END), end="", file=sys.stderr) - print("{0}{1}{2}".format(colors.RED, str(e).replace("\033", ""), colors.END), file=sys.stderr) - print("{0}{1}{2}".format(colors.YELLOW, "Lua engine stopped; please open 'Userscripts' and press Load to reinitialize scripts.", colors.END), file=sys.stderr) + logger.debug('LUA ERROR: ' + str(e).replace("\033", "")) + logger.warning("Lua engine stopped; please open 'Userscripts' and press Load to reinitialize scripts.") set_aibusy(0) def execute_genmod(): @@ -3327,9 +3484,8 @@ def execute_outmod(): vars.lua_running = False emit('from_server', {'cmd': 'errmsg', 'data': 'Lua script error; please check console.'}, broadcast=True) sendUSStatItems() - print("{0}{1}{2}".format(colors.RED, "***LUA ERROR***: ", colors.END), end="", file=sys.stderr) - print("{0}{1}{2}".format(colors.RED, str(e).replace("\033", ""), colors.END), file=sys.stderr) - print("{0}{1}{2}".format(colors.YELLOW, "Lua engine stopped; please open 'Userscripts' and press Load to reinitialize scripts.", colors.END), file=sys.stderr) + logger.debug('LUA ERROR: ' + str(e).replace("\033", "")) + logger.warning("Lua engine stopped; please open 'Userscripts' and press Load to reinitialize scripts.") set_aibusy(0) if(vars.lua_koboldbridge.resend_settings_required): vars.lua_koboldbridge.resend_settings_required = False @@ -3349,14 +3505,12 @@ def execute_outmod(): #==================================================================# @socketio.on('connect') def do_connect(): - print("{0}Client connected!{1}".format(colors.GREEN, colors.END)) + logger.info("Client connected!") emit('from_server', {'cmd': 'setchatname', 'data': vars.chatname}) emit('from_server', {'cmd': 'setanotetemplate', 'data': vars.authornotetemplate}) emit('from_server', {'cmd': 'connected', 'smandelete': vars.smandelete, 'smanrename': vars.smanrename, 'modelname': getmodelname()}) if(vars.host): emit('from_server', {'cmd': 'runs_remotely'}) - if(vars.flaskwebgui): - emit('from_server', {'cmd': 'flaskwebgui'}) if(vars.allowsp): emit('from_server', {'cmd': 'allowsp', 'data': vars.allowsp}) @@ -3402,7 +3556,7 @@ def do_connect(): @socketio.on('message') def get_message(msg): if not vars.quiet: - print("{0}Data received:{1}{2}".format(colors.GREEN, msg, colors.END)) + logger.debug(f"Data received: {msg}") # Submit action if(msg['cmd'] == 'submit'): if(vars.mode == "play"): @@ -3692,6 +3846,7 @@ def get_message(msg): elif(msg['cmd'] == 'list_model'): sendModelSelection(menu=msg['data']) elif(msg['cmd'] == 'load_model'): + logger.debug(f"Selected Model: {vars.model_selected}") if not os.path.exists("settings/"): os.mkdir("settings") changed = True @@ -3715,9 +3870,17 @@ def get_message(msg): f.close() vars.colaburl = msg['url'] + "/request" vars.model = vars.model_selected + if vars.model == "CLUSTER": + if type(msg['online_model']) is not list: + if msg['online_model'] == '': + vars.cluster_requested_models = [] + else: + vars.cluster_requested_models = [msg['online_model']] + else: + vars.cluster_requested_models = msg['online_model'] load_model(use_gpu=msg['use_gpu'], gpu_layers=msg['gpu_layers'], disk_layers=msg['disk_layers'], online_model=msg['online_model']) elif(msg['cmd'] == 'show_model'): - print("Model Name: {}".format(getmodelname())) + logger.info(f"Model Name: {getmodelname()}") emit('from_server', {'cmd': 'show_model_name', 'data': getmodelname()}, broadcast=True) elif(msg['cmd'] == 'selectmodel'): # This is run when a model line is selected from the UI (line 
from the model_menu variable) that is tagged as not a menu @@ -3759,7 +3922,7 @@ def get_message(msg): else: sendModelSelection(menu=msg['data'], folder=msg['path']) else: - vars.model_selected = msg['data'] + vars.model_selected = msg['data'] if 'path' in msg: vars.custmodpth = msg['path'] get_model_info(msg['data'], directory=msg['path']) @@ -3768,16 +3931,18 @@ def get_message(msg): elif(msg['cmd'] == 'delete_model'): if "{}/models".format(os.getcwd()) in os.path.abspath(msg['data']) or "{}\\models".format(os.getcwd()) in os.path.abspath(msg['data']): if check_if_dir_is_model(msg['data']): - print(colors.YELLOW + "WARNING: Someone deleted " + msg['data']) + logger.warning(f"Someone deleted {msg['data']}") import shutil shutil.rmtree(msg['data']) sendModelSelection(menu=msg['menu']) else: - print(colors.RED + "ERROR: Someone attempted to delete " + msg['data'] + " but this is not a valid model") + logger.error(f"Someone attempted to delete {msg['data']} but this is not a valid model") else: - print(colors.RED + "WARNING!!: Someone maliciously attempted to delete " + msg['data'] + " the attempt has been blocked.") + logger.critical(f"Someone maliciously attempted to delete {msg['data']}. The attempt has been blocked.") elif(msg['cmd'] == 'OAI_Key_Update'): get_oai_models(msg['key']) + elif(msg['cmd'] == 'Cluster_Key_Update'): + get_cluster_models(msg) elif(msg['cmd'] == 'loadselect'): vars.loadselect = msg["data"] elif(msg['cmd'] == 'spselect'): @@ -3849,6 +4014,10 @@ def get_message(msg): vars.output_streaming = msg['data'] settingschanged() refresh_settings() + elif(msg['cmd'] == 'setshowbudget'): + vars.show_budget = msg['data'] + settingschanged() + refresh_settings() elif(msg['cmd'] == 'setshowprobs'): vars.show_probs = msg['data'] settingschanged() @@ -3971,19 +4140,27 @@ def check_for_backend_compilation(): break vars.checking = False -def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False, disable_recentrng=False, no_generate=False): +def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False, disable_recentrng=False, no_generate=False, ignore_aibusy=False): # Ignore new submissions if the AI is currently busy - if(vars.aibusy): + if(not ignore_aibusy and vars.aibusy): return while(True): set_aibusy(1) - if(vars.model == "API"): + if(vars.model in ["API","CLUSTER"]): global tokenizer - tokenizer_id = requests.get( - vars.colaburl[:-8] + "/api/v1/model", - ).json()["result"] + if vars.model == "API": + tokenizer_id = requests.get( + vars.colaburl[:-8] + "/api/v1/model", + ).json()["result"] + elif len(vars.cluster_requested_models) >= 1: + # If the player has requested one or more models, we use the first one for the tokenizer + tokenizer_id = vars.cluster_requested_models[0] + # The cluster can return any number of possible models for each gen, but this happens after this step + # So at this point, this is unknown + else: + tokenizer_id = "" if tokenizer_id != vars.api_tokenizer_id: try: if(os.path.isdir(tokenizer_id)): @@ -4002,7 +4179,7 @@ def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False, except: tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, revision=vars.revision, cache_dir="cache", use_fast=False) except: - print(f"WARNING: Unknown tokenizer {repr(tokenizer_id)}") + logger.warning(f"Unknown tokenizer {repr(tokenizer_id)}") vars.api_tokenizer_id = tokenizer_id if(disable_recentrng): @@ -4113,8 +4290,8 @@ def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False, try: 
alternatives = [item['Text'] for item in vars.actions_metadata[len(vars.actions)-1]["Alternative Text"]] except: - print(len(vars.actions)) - print(vars.actions_metadata) + logger.debug(len(vars.actions)) + logger.debug(vars.actions_metadata) raise if data in alternatives: alternatives = [item for item in vars.actions_metadata[vars.actions.get_last_key() ]["Alternative Text"] if item['Text'] != data] @@ -4166,7 +4343,8 @@ def apiactionsubmit_generate(txt, minimum, maximum): vars.generated_tkns = 0 if not vars.quiet: - print("{0}Min:{1}, Max:{2}, Txt:{3}{4}".format(colors.YELLOW, minimum, maximum, utils.decodenewlines(tokenizer.decode(txt)), colors.END)) + logger.debug(f"Prompt Min:{minimum}, Max:{maximum}") + logger.prompt(utils.decodenewlines(tokenizer.decode(txt)).encode("unicode_escape").decode("utf-8")) # Clear CUDA cache if using GPU if(vars.hascuda and (vars.usegpu or vars.breakmodel)): @@ -4193,7 +4371,8 @@ def apiactionsubmit_tpumtjgenerate(txt, minimum, maximum): tpu_mtj_backend.set_rng_seed(vars.seed) if not vars.quiet: - print("{0}Min:{1}, Max:{2}, Txt:{3}{4}".format(colors.YELLOW, minimum, maximum, utils.decodenewlines(tokenizer.decode(txt)), colors.END)) + logger.debug(f"Prompt Min:{minimum}, Max:{maximum}") + logger.prompt(utils.decodenewlines(tokenizer.decode(txt)).encode("unicode_escape").decode("utf-8")) vars._actions = vars.actions vars._prompt = vars.prompt @@ -4229,6 +4408,8 @@ def apiactionsubmit(data, use_memory=False, use_world_info=False, use_story=Fals raise NotImplementedError("API generation is not supported in old Colab API mode.") elif(vars.model == "API"): raise NotImplementedError("API generation is not supported in API mode.") + elif(vars.model == "CLUSTER"): + raise NotImplementedError("API generation is not supported in API mode.") elif(vars.model == "OAI"): raise NotImplementedError("API generation is not supported in OpenAI/GooseAI mode.") elif(vars.model == "ReadOnly"): @@ -4279,7 +4460,7 @@ def apiactionsubmit(data, use_memory=False, use_world_info=False, use_story=Fals minimum = len(tokens) + 1 maximum = len(tokens) + vars.genamt - if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): + if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "CLUSTER", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): genout = apiactionsubmit_generate(tokens, minimum, maximum) elif(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): genout = apiactionsubmit_tpumtjgenerate(tokens, minimum, maximum) @@ -4406,9 +4587,9 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None, lnsp = vars.sp_length if("tokenizer" not in globals()): - from transformers import GPT2TokenizerFast + from transformers import GPT2Tokenizer global tokenizer - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") + tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=vars.revision, cache_dir="cache") lnheader = len(tokenizer._koboldai_header) @@ -4447,7 +4628,7 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None, if(actionlen == 0): # First/Prompt action - tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "OAI") else []) + memtokens + witokens + anotetkns + prompttkns + tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "CLUSTER", "OAI") else []) + memtokens + witokens + anotetkns + prompttkns assert 
len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction ln = len(tokens) + lnsp return tokens, ln+1, ln+vars.genamt @@ -4495,12 +4676,12 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None, # Did we get to add the A.N.? If not, do it here if(anotetxt != ""): if((not anoteadded) or forceanote): - tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "OAI") else []) + memtokens + witokens + anotetkns + prompttkns + tokens + tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "CLUSTER", "OAI") else []) + memtokens + witokens + anotetkns + prompttkns + tokens else: - tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "OAI") else []) + memtokens + witokens + prompttkns + tokens + tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "CLUSTER", "OAI") else []) + memtokens + witokens + prompttkns + tokens else: # Prepend Memory, WI, and Prompt before action tokens - tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "OAI") else []) + memtokens + witokens + prompttkns + tokens + tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "CLUSTER", "OAI") else []) + memtokens + witokens + prompttkns + tokens # Send completed bundle to generator assert len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction @@ -4522,23 +4703,27 @@ def calcsubmit(txt): if(vars.model != "InferKit"): subtxt, min, max = calcsubmitbudget(actionlen, winfo, mem, anotetxt, vars.actions, submission=txt) if(actionlen == 0): - if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): + if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "CLUSTER", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): generate(subtxt, min, max, found_entries=found_entries) elif(vars.model == "Colab"): sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) elif(vars.model == "API"): sendtoapi(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) + elif(vars.model == "CLUSTER"): + sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) elif(vars.model == "OAI"): oairequest(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) elif(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): tpumtjgenerate(subtxt, min, max, found_entries=found_entries) else: - if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): + if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "CLUSTER", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): generate(subtxt, min, max, found_entries=found_entries) elif(vars.model == "Colab"): sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) elif(vars.model == "API"): sendtoapi(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) + elif(vars.model == "CLUSTER"): + sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) elif(vars.model == "OAI"): oairequest(utils.decodenewlines(tokenizer.decode(subtxt)), min, max) elif(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")): @@ -4699,7 +4884,8 @@ def generate(txt, minimum, maximum, found_entries=None): found_entries = tuple(found_entries.copy() for _ in range(vars.numseqs)) if not vars.quiet: - print("{0}Min:{1}, Max:{2}, Txt:{3}{4}".format(colors.YELLOW, 
minimum, maximum, utils.decodenewlines(tokenizer.decode(txt)), colors.END)) + logger.debug(f"Prompt Min:{minimum}, Max:{maximum}") + logger.prompt(utils.decodenewlines(tokenizer.decode(txt)).encode("unicode_escape").decode("utf-8")) # Store context in memory to use it for comparison with generated content vars.lastctx = utils.decodenewlines(tokenizer.decode(txt)) @@ -4718,12 +4904,11 @@ def generate(txt, minimum, maximum, found_entries=None): vars.lua_running = False emit('from_server', {'cmd': 'errmsg', 'data': 'Lua script error; please check console.'}, broadcast=True) sendUSStatItems() - print("{0}{1}{2}".format(colors.RED, "***LUA ERROR***: ", colors.END), end="", file=sys.stderr) - print("{0}{1}{2}".format(colors.RED, str(e).replace("\033", ""), colors.END), file=sys.stderr) - print("{0}{1}{2}".format(colors.YELLOW, "Lua engine stopped; please open 'Userscripts' and press Load to reinitialize scripts.", colors.END), file=sys.stderr) + logger.debug('LUA ERROR: ' + str(e).replace("\033", "")) + logger.warning("Lua engine stopped; please open 'Userscripts' and press Load to reinitialize scripts.") else: emit('from_server', {'cmd': 'errmsg', 'data': 'Error occurred during generator call; please check console.'}, broadcast=True) - print("{0}{1}{2}".format(colors.RED, traceback.format_exc().replace("\033", ""), colors.END), file=sys.stderr) + logger.error(traceback.format_exc().replace("\033", "")) set_aibusy(0) return @@ -4762,7 +4947,7 @@ def generate(txt, minimum, maximum, found_entries=None): #==================================================================# def genresult(genout, flash=True, ignore_formatting=False): if not vars.quiet: - print("{0}{1}{2}".format(colors.CYAN, genout, colors.END)) + logger.generation(genout.encode("unicode_escape").decode("utf-8")) # Format output before continuing if not ignore_formatting: @@ -4796,7 +4981,8 @@ def genselect(genout): # Apply output formatting rules to sequences result["generated_text"] = applyoutputformatting(result["generated_text"]) if not vars.quiet: - print("{0}[Result {1}]\n{2}{3}".format(colors.CYAN, i, result["generated_text"], colors.END)) + logger.info(f"Generation Result {i}") + logger.generation(result["generated_text"].encode("unicode_escape").decode("utf-8")) i += 1 # Add the options to the actions metadata @@ -5006,18 +5192,122 @@ def sendtoapi(txt, min, max): if(len(genout) == 1): genresult(genout[0]) else: + adjusted_genout = [] + for item in genout: + adjusted_genout.append({"generated_text": item}) # Convert torch output format to transformers seqs = [] - for seq in genout: + for seq in adjusted_genout: seqs.append({"generated_text": seq}) if(vars.lua_koboldbridge.restart_sequence is not None and vars.lua_koboldbridge.restart_sequence > 0): - genresult(genout[vars.lua_koboldbridge.restart_sequence-1]["generated_text"]) + genresult(adjusted_genout[vars.lua_koboldbridge.restart_sequence-1]["generated_text"]) else: - genselect(genout) + genselect(adjusted_genout) set_aibusy(0) return +#==================================================================# +# Send transformers-style request to KoboldAI Cluster +#==================================================================# +def sendtocluster(txt, min, max): + # Log request to console + if not vars.quiet: + logger.debug(f"Tokens Min:{min-1}") + logger.prompt(txt.encode("unicode_escape").decode("utf-8")) + + # Store context in memory to use it for comparison with generated content + vars.lastctx = txt + # Build request JSON data + reqdata = { + 'max_length': max - min + 
1, + 'max_context_length': vars.max_length, + 'rep_pen': vars.rep_pen, + 'rep_pen_slope': vars.rep_pen_slope, + 'rep_pen_range': vars.rep_pen_range, + 'temperature': vars.temp, + 'top_p': vars.top_p, + 'top_k': vars.top_k, + 'top_a': vars.top_a, + 'tfs': vars.tfs, + 'typical': vars.typical, + 'n': vars.numseqs, + } + cluster_metadata = { + 'prompt': txt, + 'params': reqdata, + 'api_key': vars.apikey, + 'models': vars.cluster_requested_models, + } + logger.debug(f"Horde Payload: {cluster_metadata}") + try: + # Create request + req = requests.post( + vars.colaburl[:-8] + "/api/v1/generate/sync", + json=cluster_metadata, + ) + except requests.exceptions.ConnectionError: + errmsg = f"Horde unavailable. Please try again later" + logger.error(errmsg) + emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True) + set_aibusy(0) + return + if(req.status_code == 503): + errmsg = f"KoboldAI API Error: No available KoboldAI servers found in Horde to fulfil this request using the selected models or other properties." + logger.error(req.text) + emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True) + set_aibusy(0) + return + if(not req.ok): + errmsg = f"KoboldAI API Error: Failed to get a standard reply from the Horde. Please check the console." + logger.error(req.text) + emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True) + set_aibusy(0) + return + try: + js = req.json() + except requests.exceptions.JSONDecodeError: + errmsg = f"Unexpected message received from the Horde: '{req.text}'" + logger.error(errmsg) + emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True) + set_aibusy(0) + return + gen_servers = [(cgen['server_name'],cgen['server_id']) for cgen in js] + logger.info(f"Generations by: {gen_servers}") + # Just in case we want to announce it to the user + if len(js) == 1: + warnmsg = f"Text generated by {js[0]['server_name']}" + emit('from_server', {'cmd': 'warnmsg', 'data': warnmsg}, broadcast=True) + genout = [cgen['text'] for cgen in js] + + for i in range(vars.numseqs): + vars.lua_koboldbridge.outputs[i+1] = genout[i] + + execute_outmod() + if(vars.lua_koboldbridge.regeneration_required): + vars.lua_koboldbridge.regeneration_required = False + genout = [] + for i in range(vars.numseqs): + genout.append(vars.lua_koboldbridge.outputs[i+1]) + assert type(genout[-1]) is str + + if(len(genout) == 1): + genresult(genout[0]) + else: + adjusted_genout = [] + for item in genout: + adjusted_genout.append({"generated_text": item}) + # Convert torch output format to transformers + seqs = [] + for seq in adjusted_genout: + seqs.append({"generated_text": seq}) + if(vars.lua_koboldbridge.restart_sequence is not None and vars.lua_koboldbridge.restart_sequence > 0): + genresult(adjusted_genout[vars.lua_koboldbridge.restart_sequence-1]["generated_text"]) + else: + genselect(adjusted_genout) + + set_aibusy(0) + return #==================================================================# # Send text to TPU mesh transformer backend @@ -5033,7 +5323,8 @@ def tpumtjgenerate(txt, minimum, maximum, found_entries=None): found_entries = tuple(found_entries.copy() for _ in range(vars.numseqs)) if not vars.quiet: - print("{0}Min:{1}, Max:{2}, Txt:{3}{4}".format(colors.YELLOW, minimum, maximum, utils.decodenewlines(tokenizer.decode(txt)), colors.END)) + logger.debug(f"Prompt Min:{minimum}, Max:{maximum}") + logger.prompt(utils.decodenewlines(tokenizer.decode(txt)).encode("unicode_escape").decode("utf-8")) vars._actions = vars.actions vars._prompt = vars.prompt 
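Stripped of the surrounding UI and state handling, the new sendtocluster() above boils down to a single synchronous POST against the Horde. The sketch below is a hedged, standalone illustration of that exchange: the /api/v1/generate/sync route, the payload layout and the shape of the reply are taken from the hunk above, while the base URL, API key, model name, prompt and sampler values are placeholder assumptions.

```python
import requests

# All of these values are placeholders; in aiserver.py they come from
# vars.colaburl, vars.apikey, vars.cluster_requested_models and the sampler settings.
HORDE_BASE = "https://example-horde-host"      # hypothetical base URL
API_KEY = "0000000000"                         # hypothetical API key

cluster_metadata = {
    "prompt": "Niko the kobold stalked carefully down the alley,",
    "params": {                                # subset of the sampler fields sendtocluster() sends
        "max_length": 80,
        "max_context_length": 1024,
        "temperature": 0.5,
        "top_p": 0.9,
        "n": 1,
    },
    "api_key": API_KEY,
    "models": ["KoboldAI/OPT-2.7B-Erebus"],    # vars.cluster_requested_models
}

req = requests.post(f"{HORDE_BASE}/api/v1/generate/sync", json=cluster_metadata)
if req.status_code == 503:
    print("No Horde worker is available for the requested models.")
elif req.ok:
    # The sync endpoint returns one entry per sequence, each carrying the
    # generated text and the worker that produced it.
    for gen in req.json():
        print(gen["server_name"], "->", gen["text"])
else:
    print("Unexpected reply from the Horde:", req.text)
```

As the hunk above shows, sendtocluster() additionally surfaces the worker name to the UI as a warning message when only a single generation comes back.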
@@ -5121,9 +5412,8 @@ def tpumtjgenerate(txt, minimum, maximum, found_entries=None): vars.lua_running = False emit('from_server', {'cmd': 'errmsg', 'data': 'Lua script error; please check console.'}, broadcast=True) sendUSStatItems() - print("{0}{1}{2}".format(colors.RED, "***LUA ERROR***: ", colors.END), end="", file=sys.stderr) - print("{0}{1}{2}".format(colors.RED, str(e).replace("\033", ""), colors.END), file=sys.stderr) - print("{0}{1}{2}".format(colors.YELLOW, "Lua engine stopped; please open 'Userscripts' and press Load to reinitialize scripts.", colors.END), file=sys.stderr) + logger.debug('LUA ERROR: ' + str(e).replace("\033", "")) + logger.warning("Lua engine stopped; please open 'Userscripts' and press Load to reinitialize scripts.") else: emit('from_server', {'cmd': 'errmsg', 'data': 'Error occurred during generator call; please check console.'}, broadcast=True) print("{0}{1}{2}".format(colors.RED, traceback.format_exc().replace("\033", ""), colors.END), file=sys.stderr) @@ -5319,6 +5609,7 @@ def refresh_settings(): emit('from_server', {'cmd': 'updatefrmtadsnsp', 'data': vars.formatoptns["frmtadsnsp"]}, broadcast=True) emit('from_server', {'cmd': 'updatesingleline', 'data': vars.formatoptns["singleline"]}, broadcast=True) emit('from_server', {'cmd': 'updateoutputstreaming', 'data': vars.output_streaming}, broadcast=True) + emit('from_server', {'cmd': 'updateshowbudget', 'data': vars.show_budget}, broadcast=True) emit('from_server', {'cmd': 'updateshowprobs', 'data': vars.show_probs}, broadcast=True) # Allow toggle events again @@ -5407,7 +5698,7 @@ def inlineedit(chunk, data): vars.actions_metadata[chunk-1]['Selected Text'] = data vars.actions[chunk-1] = data else: - print(f"WARNING: Attempted to edit non-existent chunk {chunk}") + logger.warning(f"Attempted to edit non-existent chunk {chunk}") setgamesaved(False) update_story_chunk(chunk) @@ -5435,7 +5726,7 @@ def inlinedelete(chunk): vars.actions_metadata[chunk-1]['Selected Text'] = '' del vars.actions[chunk-1] else: - print(f"WARNING: Attempted to delete non-existent chunk {chunk}") + logger.warning(f"Attempted to delete non-existent chunk {chunk}") setgamesaved(False) remove_story_chunk(chunk) emit('from_server', {'cmd': 'editmode', 'data': 'false'}, broadcast=True) @@ -5727,14 +6018,14 @@ def checkworldinfo(txt, allowed_entries=None, allowed_folders=None, force_use_tx # Remove leading/trailing spaces if the option is enabled if(vars.wirmvwhtsp): ky = k.strip() - if ky in txt: + if ky.lower() in txt.lower(): if wi.get("selective", False) and len(keys_secondary): found = False for ks in keys_secondary: ksy = ks if(vars.wirmvwhtsp): ksy = ks.strip() - if ksy in txt: + if ksy.lower() in txt.lower(): wimem = wimem + wi["content"] + "\n" found_entries.add(id(wi)) found = True @@ -5866,7 +6157,9 @@ def oairequest(txt, min, max): vars.lastctx = txt # Build request JSON data - if 'GooseAI' in args.configname: + # GooseAI is a subntype of OAI. 
So to check if it's this type, we check the configname as a workaround + # as the vars.model will always be OAI + if 'GooseAI' in vars.configname: reqdata = { 'prompt': txt, 'max_tokens': vars.genamt, @@ -6696,8 +6989,8 @@ def final_startup(): threading.Thread(target=__preempt_tokenizer).start() # Load soft prompt specified by the settings file, if applicable - if(path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") + if(path.exists(get_config_filename())): + file = open(get_config_filename(), "r") js = json.load(file) if(vars.allowsp and "softprompt" in js and type(js["softprompt"]) is str and all(q not in js["softprompt"] for q in ("..", ":")) and (len(js["softprompt"]) == 0 or all(js["softprompt"][0] not in q for q in ("/", "\\")))): spRequest(js["softprompt"]) @@ -7182,6 +7475,7 @@ class GenerationInputSchema(SamplerSettingsSchema): singleline: Optional[bool] = fields.Boolean(metadata={"description": "Output formatting option. When enabled, removes everything after the first line of the output, including the newline.\n\nIf `disable_output_formatting` is `true`, this defaults to `false` instead of the value in the KoboldAI GUI."}) disable_input_formatting: bool = fields.Boolean(load_default=True, metadata={"description": "When enabled, all input formatting options default to `false` instead of the value in the KoboldAI GUI"}) frmtadsnsp: Optional[bool] = fields.Boolean(metadata={"description": "Input formatting option. When enabled, adds a leading space to your input if there is no trailing whitespace at the end of the previous action.\n\nIf `disable_input_formatting` is `true`, this defaults to `false` instead of the value in the KoboldAI GUI."}) + quiet: Optional[bool] = fields.Boolean(metadata={"description": "When enabled, generated output will not be displayed in the console."}) class GenerationResultSchema(KoboldSchema): text: str = fields.String(required=True, metadata={"description": "Generated output as plain text."}) @@ -7263,12 +7557,19 @@ class WorldInfoFoldersUIDsSchema(KoboldSchema): class WorldInfoUIDsSchema(WorldInfoEntriesUIDsSchema): folders: List[WorldInfoFolderSchema] = fields.List(fields.Nested(WorldInfoFolderUIDsSchema), required=True) +class ModelSelectionSchema(KoboldSchema): + model: str = fields.String(required=True, validate=validate.Regexp(r"^(?!\s*NeoCustom)(?!\s*GPT2Custom)(?!\s*TPUMeshTransformerGPTJ)(?!\s*TPUMeshTransformerGPTNeoX)(?!\s*GooseAI)(?!\s*OAI)(?!\s*InferKit)(?!\s*Colab)(?!\s*API).*$"), metadata={"description": 'Hugging Face model ID, the path to a model folder (relative to the "models" folder in the KoboldAI root folder) or "ReadOnly" for no model'}) + def _generate_text(body: GenerationInputSchema): if vars.aibusy or vars.genseqs: abort(Response(json.dumps({"detail": { "msg": "Server is busy; please try again later.", "type": "service_unavailable", }}), mimetype="application/json", status=503)) + # This maps each property of the setting to use when sending the generate request idempotently + To the object which typically contains its value + This allows setting the property only for the API generation, and then reverting the setting + To what it was before.
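The comment block above introduces the mapping that _generate_text uses to make API generations idempotent: each supplied schema field temporarily overrides the object that normally owns it (typically vars), and the previous values are restored once the request finishes. A simplified sketch of that override-and-restore pattern, using hypothetical setting names rather than the real mapping, could look like this:

```python
# Hypothetical illustration of the override-and-restore pattern;
# the real code walks the `mapping` dict and operates on vars.
class Settings:
    temperature = 0.5
    max_length = 80

def generate_idempotently(settings, overrides, generate):
    saved = {}
    try:
        # Apply only the fields the API request actually supplied.
        for name, value in overrides.items():
            if value is None:
                continue
            saved[name] = getattr(settings, name)
            setattr(settings, name, value)
        return generate()
    finally:
        # Always restore, even if generation raised an exception.
        for name, value in saved.items():
            setattr(settings, name, value)

settings = Settings()
result = generate_idempotently(settings, {"temperature": 0.9}, lambda: "output")
assert settings.temperature == 0.5  # original value is back
```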
mapping = { "disable_input_formatting": ("vars", "disable_input_formatting", None), "disable_output_formatting": ("vars", "disable_output_formatting", None), @@ -7289,6 +7590,7 @@ def _generate_text(body: GenerationInputSchema): "max_length": ("vars", "genamt", None), "max_context_length": ("vars", "max_length", None), "n": ("vars", "numseqs", None), + "quiet": ("vars", "quiet", None), } saved_settings = {} set_aibusy(1) @@ -7430,7 +7732,7 @@ def post_generate(body: GenerationInputSchema): schema: GenerationInputSchema example: prompt: |-2 - Explosions of suspicious origin occur at AMNAT satellite-receiver stations from Turkey to Labrador as three high-level Canadian defense ministers vanish and then a couple of days later are photographed at a Volgograd bistro hoisting shots of Stolichnaya with Slavic bimbos on their knee. + Niko the kobold stalked carefully down the alley, his small scaly figure obscured by a dusky cloak that fluttered lightly in the cold winter breeze. top_p: 0.9 temperature: 0.5 responses: @@ -7442,8 +7744,7 @@ def post_generate(body: GenerationInputSchema): example: results: - text: |-2 - It is later established that all of the cabinet members have died of old age. - MEGAMATRIX becomes involved in the growing number of mass abductions and kidnappings. Many disappearances occur along highways in western Canada, usually when traffic has come to a standstill because of a stalled truck or snowstorm. One or two abducted individuals will be released within a day or so but never + Holding up his tail to keep it from dragging in the dirty snow that covered the cobblestone, he waited patiently for the butcher to turn his attention from his stall so that he could pilfer his next meal: a tender-looking chicken. {api_validation_error_response} {api_not_implemented_response} {api_server_busy_response} @@ -7474,6 +7775,49 @@ def get_model(): return {"result": vars.model} +@api_v1.put("/model") +@api_schema_wrap +def put_model(body: ModelSelectionSchema): + """--- + put: + summary: Load a model + description: |-2 + Loads a model given its Hugging Face model ID, the path to a model folder (relative to the "models" folder in the KoboldAI root folder) or "ReadOnly" for no model. 
+ tags: + - model + requestBody: + required: true + content: + application/json: + schema: ModelSelectionSchema + example: + model: ReadOnly + responses: + 200: + description: Successful request + content: + application/json: + schema: EmptySchema + {api_validation_error_response} + {api_server_busy_response} + """ + if vars.aibusy or vars.genseqs: + abort(Response(json.dumps({"detail": { + "msg": "Server is busy; please try again later.", + "type": "service_unavailable", + }}), mimetype="application/json", status=503)) + set_aibusy(1) + old_model = vars.model + vars.model = body.model.strip() + try: + load_model(use_breakmodel_args=True, breakmodel_args_default_to_cpu=True) + except Exception as e: + vars.model = old_model + raise e + set_aibusy(0) + return {} + + def prompt_validator(prompt: str): if len(prompt.strip()) == 0: raise ValidationError("String does not match expected pattern.") @@ -7523,7 +7867,7 @@ def post_story_end(body: SubmissionInputSchema): numseqs = vars.numseqs vars.numseqs = 1 try: - actionsubmit(body.prompt, force_submit=True, no_generate=True) + actionsubmit(body.prompt, force_submit=True, no_generate=True, ignore_aibusy=True) finally: vars.disable_set_aibusy = disable_set_aibusy vars.standalone = _standalone @@ -9447,6 +9791,32 @@ def get_config_soft_prompt(): """ return {"value": vars.spfilename.strip()} +class SoftPromptsListSchema(KoboldSchema): + values: List[SoftPromptSettingSchema] = fields.List(fields.Nested(SoftPromptSettingSchema), required=True, metadata={"description": "Array of available softprompts."}) + +@api_v1.get("/config/soft_prompts_list") +@api_schema_wrap +def get_config_soft_prompts_list(): + """--- + get: + summary: Retrieve all available softprompt filenames + tags: + - config + responses: + 200: + description: Successful request + content: + application/json: + schema: SoftPromptsListSchema + example: + values: [] + """ + splist = [] + for sp in fileops.getspfiles(vars.modeldim): + + splist.append({"value":sp["filename"]}) + return {"values": splist} + @api_v1.put("/config/soft_prompt") @api_schema_wrap def put_config_soft_prompt(body: SoftPromptSettingSchema): @@ -9682,11 +10052,14 @@ for schema in config_endpoint_schemas: #==================================================================# # Final startup commands to launch Flask app #==================================================================# -print("", end="", flush=True) if __name__ == "__main__": - print("{0}\nStarting webserver...{1}".format(colors.GREEN, colors.END), flush=True) general_startup() + # Start flask & SocketIO + logger.init("Flask", status="Starting") + Session(app) + logger.init_ok("Flask", status="OK") + logger.init("Webserver", status="Starting") patch_transformers() #show_select_model_list() if vars.model == "" or vars.model is None: @@ -9724,38 +10097,45 @@ if __name__ == "__main__": if(args.localtunnel or args.ngrok or args.remote): with open('cloudflare.log', 'w') as cloudflarelog: cloudflarelog.write("KoboldAI has finished loading and is available at the following link : " + cloudflare) - print(format(colors.GREEN) + "KoboldAI has finished loading and is available at the following link : " + cloudflare + format(colors.END)) + logger.init_ok("Webserver", status="OK") + logger.message(f"KoboldAI has finished loading and is available at the following link: {cloudflare}") else: - print("{0}Webserver has started, you can now connect to this machine at port {1}{2}" - .format(colors.GREEN, port, colors.END)) + logger.init_ok("Webserver", status="OK") + 
logger.message(f"Webserver has started, you can now connect to this machine at port: {port}") vars.serverstarted = True socketio.run(app, host='0.0.0.0', port=port) else: if args.unblock: if not args.no_ui: - import webbrowser - webbrowser.open_new('http://localhost:{0}'.format(port)) - print("{0}Server started!\nYou may now connect with a browser at http://127.0.0.1:{1}/{2}" - .format(colors.GREEN, port, colors.END)) + try: + import webbrowser + webbrowser.open_new('http://localhost:{0}'.format(port)) + except: + pass + logger.init_ok("Webserver", status="OK") + logger.message(f"Webserver started! You may now connect with a browser at http://127.0.0.1:{port}") vars.serverstarted = True socketio.run(app, port=port, host='0.0.0.0') else: - try: - from flaskwebgui import FlaskUI - vars.serverstarted = True - vars.flaskwebgui = True - FlaskUI(app, socketio=socketio, start_server="flask-socketio", maximized=True, close_server_on_exit=True).run() - except: - if not args.no_ui: + if not args.no_ui: + try: import webbrowser webbrowser.open_new('http://localhost:{0}'.format(port)) - print("{0}Server started!\nYou may now connect with a browser at http://127.0.0.1:{1}/{2}" - .format(colors.GREEN, port, colors.END)) - vars.serverstarted = True - socketio.run(app, port=port) + except: + pass + logger.init_ok("Webserver", status="OK") + logger.message(f"Webserver started! You may now connect with a browser at http://127.0.0.1:{port}") + vars.serverstarted = True + socketio.run(app, port=port) + logger.init("Webserver", status="Closed") + else: general_startup() + # Start flask & SocketIO + logger.init("Flask", status="Starting") + Session(app) + logger.init_ok("Flask", status="OK") patch_transformers() #show_select_model_list() if vars.model == "" or vars.model is None: diff --git a/colab/GPU.ipynb b/colab/GPU.ipynb index f652d3bb..a74ef0cf 100644 --- a/colab/GPU.ipynb +++ b/colab/GPU.ipynb @@ -7,7 +7,6 @@ "private_outputs": true, "provenance": [], "collapsed_sections": [], - "authorship_tag": "ABX9TyPbwW79K9/RkYH9i9rkYFyj", "include_colab_link": true }, "kernelspec": { @@ -68,9 +67,9 @@ "#@title <-- Select your model below and then click this to start KoboldAI\n", "#@markdown You can find a description of the models below along with instructions on how to start KoboldAI.\n", "\n", - "Model = \"Nerys 2.7B\" #@param [\"Nerys 2.7B\", \"Janeway 2.7B\", \"Picard 2.7B\", \"AID 2.7B\", \"Horni LN 2.7B\", \"Horni 2.7B\", \"Shinen 2.7B\", \"Neo 2.7B\"] {allow-input: true}\n", + "Model = \"Nerys 2.7B\" #@param [\"Nerys 2.7B\", \"AID 2.7B\", \"Erebus 2.7B\", \"Janeway 2.7B\", \"Picard 2.7B\", \"Horni LN 2.7B\", \"Horni 2.7B\", \"Shinen 2.7B\", \"Neo 2.7B\"] {allow-input: true}\n", "Version = \"Official\" #@param [\"Official\", \"United\"] {allow-input: true}\n", - "Provider = \"Cloudflare\" #@param [\"Localtunnel\", \"Cloudflare\"]\n", + "Provider = \"Localtunnel\" #@param [\"Localtunnel\", \"Cloudflare\"]\n", "\n", "!nvidia-smi\n", "from google.colab import drive\n", @@ -80,11 +79,15 @@ " Model = \"KoboldAI/fairseq-dense-2.7B-Nerys\"\n", " path = \"\"\n", " download = \"\"\n", + "elif Model == \"Erebus 2.7B\":\n", + " Model = \"KoboldAI/OPT-2.7B-Erebus\"\n", + " path = \"\"\n", + " download = \"\"\n", "elif Model == \"Janeway 2.7B\":\n", " Model = \"KoboldAI/GPT-Neo-2.7B-Janeway\"\n", " path = \"\"\n", " download = \"\"\n", - "elif Model == \"Picard 2.7B\":\n", + "elif Model == \"Picard 2.7B\":\n", " Model = \"KoboldAI/GPT-Neo-2.7B-Picard\"\n", " path = \"\"\n", " download = \"\"\n", @@ -156,7 +159,7 @@ "| 
Adventure | These models are excellent for people willing to play KoboldAI like a Text Adventure game and are meant to be used with Adventure mode enabled. Even if you wish to use it as a Novel style model you should always have Adventure mode on and set it to story. These models typically have a strong bias towards the use of the word You and without Adventure mode enabled break the story flow and write actions on your behalf. |\n", "| Generic | Generic models are not trained towards anything specific, typically used as a basis for other tasks and models. They can do everything the other models can do, but require much more handholding to work properly. Generic models are an ideal basis for tasks that we have no specific model for, or for experiencing a softprompt in its raw form. |\n", "\n", - "---\n", + "---\n", "# How to start KoboldAI in 7 simple steps\n", "Using KoboldAI on Google Colab is easy! Simply follow these steps to get started:\n", "1. Mobile phone? Tap the play button below next to \"<--- Tap this if you play on mobile\" to reveal an audio player, play the silent audio to keep the tab alive so Google will not shut you down when your using KoboldAI. If no audio player is revealed your phone browser does not support Google Colab in the mobile view, go to your browser menu and enable Desktop mode before you continue.\n", @@ -174,4 +177,4 @@ } } ] -} +} \ No newline at end of file diff --git a/colab/TPU.ipynb b/colab/TPU.ipynb index 22c35e08..d01fedf6 100644 --- a/colab/TPU.ipynb +++ b/colab/TPU.ipynb @@ -66,7 +66,7 @@ "#@title <-- Select your model below and then click this to start KoboldAI\n", "#@markdown You can find a description of the models below along with instructions on how to start KoboldAI.\n", "\n", - "Model = \"Nerys 13B V2\" #@param [\"Nerys 13B V2\", \"Janeway 13B\", \"Shinen 13B\", \"Skein 20B\", \"Skein 6B\", \"Janeway 6B\", \"Adventure 6B\", \"Shinen 6B\", \"Lit 6B\", \"NeoX 20B\", \"OPT 13B\", \"Fairseq Dense 13B\", \"GPT-J-6B\"] {allow-input: true}\n", + "Model = \"Nerys 13B V2\" #@param [\"Nerys 13B V2\", \"Erebus 13B\", \"Janeway 13B\", \"Shinen 13B\", \"Skein 20B\", \"Erebus 20B\", \"Skein 6B\", \"Janeway 6B\", \"Adventure 6B\", \"Shinen 6B\", \"Lit V2 6B\", \"Lit 6B\", \"NeoX 20B\", \"OPT 13B\", \"Fairseq Dense 13B\", \"GPT-J-6B\"] {allow-input: true}\n", "Version = \"Official\" #@param [\"Official\", \"United\"] {allow-input: true}\n", "Provider = \"Cloudflare\" #@param [\"Localtunnel\", \"Cloudflare\"]\n", "\n", @@ -86,13 +86,21 @@ " path = \"\"\n", " download = \"\"\n", "elif Model == \"Nerys 13B V2\":\n", - " Model = \"KoboldAI/fairseq-dense-13B-Nerys-v2\"\n", + " Model = \"KoboldAI/OPT-13B-Nerys-v2\"\n", + " path = \"\"\n", + " download = \"\"\n", + "elif Model == \"Erebus 13B\":\n", + " Model = \"KoboldAI/OPT-13B-Erebus\"\n", " path = \"\"\n", " download = \"\"\n", "elif Model == \"Shinen 13B\":\n", " Model = \"KoboldAI/fairseq-dense-13B-Shinen\"\n", " path = \"\"\n", " download = \"\"\n", + "elif Model == \"Erebus 20B\":\n", + " Model = \"KoboldAI/GPT-NeoX-20B-Erebus\"\n", + " path = \"\"\n", + " download = \"\"\n", "elif Model == \"Skein 20B\":\n", " Model = \"KoboldAI/GPT-NeoX-20B-Skein\"\n", " path = \"\"\n", @@ -113,6 +121,10 @@ " Model = \"KoboldAI/GPT-J-6B-Adventure\"\n", " path = \"\"\n", " download = \"\"\n", + "elif Model == \"Lit V2 6B\":\n", + " Model = \"hakurei/litv2-6B-rev3\"\n", + " path = \"\"\n", + " download = \"\"\n", "elif Model == \"Lit 6B\":\n", " Model = \"hakurei/lit-6B\"\n", " path = \"\"\n", @@ -200,7 +212,7 @@ 
"source": [ "#@title Model Cleaner\n", "#@markdown Out of space? Run this to remove all cached models (Google Drive models are not effected).\n", - "!rm /content/KoboldAI-Client/cache/*\n" + "!rm -rf /content/KoboldAI-Client/cache/*\n" ], "metadata": { "cellView": "form", diff --git a/docker-standalone/docker-helper.sh b/docker-standalone/docker-helper.sh old mode 100644 new mode 100755 diff --git a/environments/finetuneanon.yml b/environments/finetuneanon.yml index 995bd8fa..85d5ea66 100644 --- a/environments/finetuneanon.yml +++ b/environments/finetuneanon.yml @@ -18,6 +18,7 @@ dependencies: - git=2.35.1 - marshmallow>=3.13 - apispec-webframeworks + - loguru - pip: - git+https://github.com/finetuneanon/transformers@gpt-neo-localattention3-rp-b - flask-cloudflared diff --git a/environments/huggingface.yml b/environments/huggingface.yml index 2d9ace1d..7abceefa 100644 --- a/environments/huggingface.yml +++ b/environments/huggingface.yml @@ -19,6 +19,7 @@ dependencies: - protobuf - marshmallow>=3.13 - apispec-webframeworks + - loguru - pip: - flask-cloudflared - flask-ngrok diff --git a/environments/rocm-finetune.yml b/environments/rocm-finetune.yml index 05d7a2c1..fc56eb4f 100644 --- a/environments/rocm-finetune.yml +++ b/environments/rocm-finetune.yml @@ -14,6 +14,7 @@ dependencies: - git=2.35.1 - marshmallow>=3.13 - apispec-webframeworks + - loguru - pip: - --find-links https://download.pytorch.org/whl/rocm4.2/torch_stable.html - torch diff --git a/environments/rocm.yml b/environments/rocm.yml index 142d4230..e885f4df 100644 --- a/environments/rocm.yml +++ b/environments/rocm.yml @@ -16,9 +16,10 @@ dependencies: - protobuf - marshmallow>=3.13 - apispec-webframeworks + - loguru - pip: - - --find-links https://download.pytorch.org/whl/rocm4.2/torch_stable.html - - torch==1.10.* + - --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 + - torch - torchvision - flask-cloudflared - flask-ngrok diff --git a/fileops.py b/fileops.py index c303764e..a416f24d 100644 --- a/fileops.py +++ b/fileops.py @@ -3,6 +3,7 @@ from typing import Tuple, Union, Optional import os import json import zipfile +from logger import logger #==================================================================# # Generic Method for prompting for file path @@ -149,16 +150,16 @@ def getspfiles(model_dimension: int): continue z, version, shape, fortran_order, dtype = checksp(file, model_dimension) if z == 1: - print(f"Browser SP loading error: {file} is malformed or not a soft prompt ZIP file.") + logger.warning(f"Softprompt {file} is malformed or not a soft prompt ZIP file.") continue if z == 2: - print(f"Browser SP loading error: {file} tensor.npy has unsupported dtype '{dtype.name}'.") + logger.warning(f"Softprompt {file} tensor.npy has unsupported dtype '{dtype.name}'.") continue if z == 3: - print(f"Browser SP loading error: {file} tensor.npy has model dimension {shape[1]} which does not match your model's model dimension of {model_dimension}. This usually means this soft prompt is not compatible with your model.") + logger.debug(f"Softprompt {file} tensor.npy has model dimension {shape[1]} which does not match your model's model dimension of {model_dimension}. 
This usually means this soft prompt is not compatible with your model.") continue if z == 4: - print(f"Browser SP loading error: {file} tensor.npy has {shape[0]} tokens but it is supposed to have less than 2048 tokens.") + logger.warning(f"Softprompt {file} tensor.npy has {shape[0]} tokens but it is supposed to have less than 2048 tokens.") continue assert isinstance(z, zipfile.ZipFile) try: diff --git a/gensettings.py b/gensettings.py index 4824ed27..a823f59b 100644 --- a/gensettings.py +++ b/gensettings.py @@ -241,17 +241,6 @@ gensettingstf = [ "default": 0, "tooltip": "Causes generation to be fully deterministic -- the model will always output the same thing as long as your story, settings and RNG seed are the same. If this is off, only the sequence of outputs that the model makes will be deterministic." }, - { - "uitype": "toggle", - "unit": "bool", - "label": "Debug", - "id": "debug", - "min": 0, - "max": 1, - "step": 1, - "default": 0, - "tooltip": "Show debug info" - }, { "uitype": "toggle", "unit": "bool", @@ -285,6 +274,17 @@ gensettingstf = [ "default": 0, "tooltip": "Shows token usage when typing in relevant text boxes. May lag slower devices." }, + { + "uitype": "toggle", + "unit": "bool", + "label": "Debug", + "id": "debug", + "min": 0, + "max": 1, + "step": 1, + "default": 0, + "tooltip": "Show debug info" + }, ] gensettingsik =[{ diff --git a/logger.py b/logger.py new file mode 100644 index 00000000..8da2aa7e --- /dev/null +++ b/logger.py @@ -0,0 +1,99 @@ +import sys +from functools import partialmethod +from loguru import logger + +STDOUT_LEVELS = ["GENERATION", "PROMPT"] +INIT_LEVELS = ["INIT", "INIT_OK", "INIT_WARN", "INIT_ERR"] +MESSAGE_LEVELS = ["MESSAGE"] +# By default we're at error level or higher +verbosity = 20 +quiet = 0 + +def set_logger_verbosity(count): + global verbosity + # The count comes reversed. 
So count = 0 means minimum verbosity + While count 5 means maximum verbosity + So the more count we have, the lower we drop the verbosity maximum + verbosity = 20 - (count * 10) + +def quiesce_logger(count): + global quiet + # The bigger the count, the more silent we want our logger + quiet = count * 10 + +def is_stdout_log(record): + if record["level"].name not in STDOUT_LEVELS: + return(False) + if record["level"].no < verbosity + quiet: + return(False) + return(True) + +def is_init_log(record): + if record["level"].name not in INIT_LEVELS: + return(False) + if record["level"].no < verbosity + quiet: + return(False) + return(True) + +def is_msg_log(record): + if record["level"].name not in MESSAGE_LEVELS: + return(False) + if record["level"].no < verbosity + quiet: + return(False) + return(True) + +def is_stderr_log(record): + if record["level"].name in STDOUT_LEVELS + INIT_LEVELS + MESSAGE_LEVELS: + return(False) + if record["level"].no < verbosity + quiet: + return(False) + return(True) + +def test_logger(): + logger.generation("This is a generation message\nIt is typically multiline\nThree Lines".encode("unicode_escape").decode("utf-8")) + logger.prompt("This is a prompt message") + logger.debug("Debug Message") + logger.info("Info Message") + logger.warning("Warning Message") + logger.error("Error Message") + logger.critical("Critical Message") + logger.init("This is an init message", status="Starting") + logger.init_ok("This is an init message", status="OK") + logger.init_warn("This is an init message", status="Warning") + logger.init_err("This is an init message", status="Error") + logger.message("This is a user message") + sys.exit() + + +logfmt = "{level: <10} | {name}:{function}:{line} - {message}" +genfmt = "{level: <10} @ {time:YYYY-MM-DD HH:mm:ss} | {message}" +initfmt = "INIT | {extra[status]: <10} | {message}" +msgfmt = "{level: <10} | {message}" + +logger.level("GENERATION", no=24, color="") +logger.level("PROMPT", no=23, color="") +logger.level("INIT", no=31, color="") +logger.level("INIT_OK", no=31, color="") +logger.level("INIT_WARN", no=31, color="") +logger.level("INIT_ERR", no=31, color="") +# Messages contain important information without which this application might not be able to be used +# As such, they have the highest priority +logger.level("MESSAGE", no=61, color="") + +logger.__class__.generation = partialmethod(logger.__class__.log, "GENERATION") +logger.__class__.prompt = partialmethod(logger.__class__.log, "PROMPT") +logger.__class__.init = partialmethod(logger.__class__.log, "INIT") +logger.__class__.init_ok = partialmethod(logger.__class__.log, "INIT_OK") +logger.__class__.init_warn = partialmethod(logger.__class__.log, "INIT_WARN") +logger.__class__.init_err = partialmethod(logger.__class__.log, "INIT_ERR") +logger.__class__.message = partialmethod(logger.__class__.log, "MESSAGE") + +config = { + "handlers": [ + {"sink": sys.stderr, "format": logfmt, "colorize":True, "filter": is_stderr_log}, + {"sink": sys.stdout, "format": genfmt, "level": "PROMPT", "colorize":True, "filter": is_stdout_log}, + {"sink": sys.stdout, "format": initfmt, "level": "INIT", "colorize":True, "filter": is_init_log}, + {"sink": sys.stdout, "format": msgfmt, "level": "MESSAGE", "colorize":True, "filter": is_msg_log} + ], +} +logger.configure(**config) diff --git a/readme.md b/readme.md index 43b43046..413242d5 100644 --- a/readme.md +++ b/readme.md @@ -50,30 +50,35 @@ Each edition features different models and requires different hardware to run, t ## [TPU Edition Model 
Descriptions](https://colab.research.google.com/github/KoboldAI/KoboldAI-Client/blob/main/colab/TPU.ipynb) -| Model | Size | Style | Description | -| --- | --- | --- | --- | -| [Nerys](https://huggingface.co/KoboldAI/fairseq-dense-13B-Nerys) by Mr Seeker | 13B | Novel/Adventure | Nerys is a hybrid model based on Pike (A newer Janeway), on top of the Pike dataset you also get some Light Novels, Adventure mode support and a little bit of Shinen thrown in the mix. The end result is a very diverse model that is heavily biased towards SFW novel writing, but one that can go beyond its novel training and make for an excellent adventure model to. Adventure mode is best played from a second person perspective, but can be played in first or third person as well. Novel writing can be done best from the first or third person. | -| [Janeway](https://huggingface.co/KoboldAI/fairseq-dense-13B-Janeway) by Mr Seeker | 13B | Novel | Janeway is a model created from Picard's dataset combined with a brand new collection of ebooks. This model is trained on 20% more content than Picard and has been trained on literature from various genres. Although the model is mainly focussed on SFW, romantic scenes might involve a degree of nudity. | -| [Shinen](https://huggingface.co/KoboldAI/fairseq-dense-13B-Shinen) by Mr Seeker | 13B | NSFW | Shinen is an NSFW model designed to be more explicit. Trained on a variety of stories from the website Sexstories it contains many different kinks. | -| [Skein](https://huggingface.co/KoboldAI/GPT-J-6B-Skein) by VE\_FORBRYDERNE | 6B | Adventure | Skein is best used with Adventure mode enabled, it consists of a 4 times larger adventure dataset than the Adventure model making it excellent for text adventure gaming. On top of that it also consists of light novel training further expanding its knowledge and writing capabilities. It can be used with the You filter bias if you wish to write Novels with it, but dedicated Novel models can perform better for this task. | -| [Adventure](https://huggingface.co/KoboldAI/GPT-J-6B-Adventure) by VE\_FORBRYDERNE | 6B | Adventure | Adventure is a 6B model designed to mimick the behavior of AI Dungeon. It is exclusively for Adventure Mode and can take you on the epic and wackey adventures that AI Dungeon players love. It also features the many tropes of AI Dungeon as it has been trained on very similar data. It must be used in second person (You). | -| [Lit](https://huggingface.co/hakurei/lit-6B) by Haru | 6B | NSFW | Lit is a great NSFW model trained by Haru on both a large set of Literotica stories and high quality novels along with tagging support. Creating a high quality model for your NSFW stories. This model is exclusively a novel model and is best used in third person. | -| Neo(X) by EleutherAI | 20B | Generic | NeoX is the largest EleutherAI model currently available, being a generic model it is not particularly trained towards anything and can do a variety of writing, Q&A and coding tasks. 20B's performance is closely compared to the 13B models and it is worth trying both especially if you have a task that does not involve english writing. Its behavior will be similar to the GPT-J-6B model since they are trained on the same dataset but with more sensitivity towards repetition penalty and with more knowledge. | -| [Fairseq Dense](https://huggingface.co/KoboldAI/fairseq-dense-13B) | 13B | Generic | Trained by Facebook Researchers this model stems from the MOE research project within Fairseq. 
This particular version has been converted by us for use in KoboldAI. It is known to be on par with the larger 20B model from EleutherAI and considered as better for pop culture and language tasks. Because the model has never seen a new line (enter) it may perform worse on formatting and paragraphing. | -| [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6B) by EleutherAI | 6B | Generic | This model serves as the basis for most other 6B models (Some being based on Fairseq Dense instead). Being trained on the Pile and not biased towards anything in particular it is suitable for a variety of tasks such as writing, Q&A and coding tasks. You will likely get better result with larger generic models or finetuned models. | +| Model | Style | Description | +| --- | --- | --- | +| [Nerys](https://huggingface.co/KoboldAI/fairseq-dense-13B-Nerys) by Mr Seeker | Novel/Adventure | Nerys is a hybrid model based on Pike (A newer Janeway), on top of the Pike dataset you also get some Light Novels, Adventure mode support and a little bit of Shinen thrown in the mix. The end result is a very diverse model that is heavily biased towards SFW novel writing, but one that can go beyond its novel training and make for an excellent adventure model too. Adventure mode is best played from a second person perspective, but can be played in first or third person as well. Novel writing can be done best from the first or third person. | +| [Erebus](https://huggingface.co/KoboldAI/OPT-13B-Erebus) by Mr Seeker | NSFW | Erebus is our community's flagship NSFW model, being a combination of multiple large datasets that include Literotica, Shinen and erotic novels from Nerys and featuring thorough tagging support, it covers the vast majority of erotic writing styles. This model is capable of replacing both the Lit and Shinen models in terms of content and style and has been well received as (one of) the best NSFW models out there. If you wish to use this model for commercial or non-research usage we recommend choosing the 20B version as that one is not subject to the restrictive OPT license. | +| [Janeway](https://huggingface.co/KoboldAI/fairseq-dense-13B-Janeway) by Mr Seeker | Novel | Janeway is a model created from Picard's dataset combined with a brand new collection of ebooks. This model is trained on 20% more content than Picard and has been trained on literature from various genres. Although the model is mainly focused on SFW, romantic scenes might involve a degree of nudity. | +| [Shinen](https://huggingface.co/KoboldAI/fairseq-dense-13B-Shinen) by Mr Seeker | NSFW | Shinen is an NSFW model trained on a variety of stories from the website Sexstories; it contains many different kinks. It has been merged into the larger (and better) Erebus model. | +| [Skein](https://huggingface.co/KoboldAI/GPT-J-6B-Skein) by VE\_FORBRYDERNE | Adventure | Skein is best used with Adventure mode enabled, it consists of a 4 times larger adventure dataset than the Adventure model making it excellent for text adventure gaming. On top of that it also consists of light novel training further expanding its knowledge and writing capabilities. It can be used with the You filter bias if you wish to write Novels with it, but dedicated Novel models can perform better for this task. | +| [Adventure](https://huggingface.co/KoboldAI/GPT-J-6B-Adventure) by VE\_FORBRYDERNE | Adventure | Adventure is a 6B model designed to mimic the behavior of AI Dungeon.
It is exclusively for Adventure Mode and can take you on the epic and wacky adventures that AI Dungeon players love. It also features the many tropes of AI Dungeon as it has been trained on very similar data. It must be used in second person (You). | +| [Lit](https://huggingface.co/hakurei/lit-6B) ([V2](https://huggingface.co/hakurei/litv2-6B-rev3)) by Haru | NSFW | Lit is a great NSFW model trained by Haru on both a large set of Literotica stories and high quality novels along with tagging support. Creating a high quality model for your NSFW stories. This model is exclusively a novel model and is best used in third person. | +| [OPT](https://huggingface.co/facebook/opt-13b) by Metaseq | Generic | OPT is considered one of the best base models as far as content goes, its behavior has the strengths of both GPT-Neo and Fairseq Dense. Compared to Neo duplicate and unnecessary content has been left out, while additional literature was added in, similar to the Fairseq Dense model. The Fairseq Dense model however lacks the broader data that OPT does have. The biggest downfall of OPT is its license, which prohibits any commercial usage, or usage beyond research purposes. | +| [Neo(X)](https://huggingface.co/EleutherAI/gpt-neox-20b) by EleutherAI | Generic | NeoX is the largest EleutherAI model currently available, being a generic model it is not particularly trained towards anything and can do a variety of writing, Q&A and coding tasks. 20B's performance is closely compared to the 13B models and it is worth trying both especially if you have a task that does not involve English writing. Its behavior will be similar to the GPT-J-6B model since they are trained on the same dataset but with more sensitivity towards repetition penalty and with more knowledge. | +| [Fairseq Dense](https://huggingface.co/KoboldAI/fairseq-dense-13B) | Generic | Trained by Facebook Researchers this model stems from the MOE research project within Fairseq. This particular version has been converted by us for use in KoboldAI. It is known to be on par with the larger 20B model from EleutherAI and considered as better for pop culture and language tasks. Because the model has never seen a new line (enter) it may perform worse on formatting and paragraphing. Compared to other models the dataset focuses primarily on literature and contains little else. | +| [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6B) by EleutherAI | Generic | This model serves as the basis for most other 6B models (Some being based on Fairseq Dense instead). Being trained on the Pile and not biased towards anything in particular it is suitable for a variety of tasks such as writing, Q&A and coding tasks. You will likely get better results with larger generic models or finetuned models. | ## [GPU Edition Model Descriptions](https://colab.research.google.com/github/KoboldAI/KoboldAI-Client/blob/main/colab/GPU.ipynb) -| Model | Size | Style | Description | -| --- | --- | --- | --- | -| [Nerys 2.7B](https://huggingface.co/KoboldAI/fairseq-dense-2.7B-Nerys) by Mr Seeker | 2.7B | Novel/Adventure | Nerys is a hybrid model based on Pike (A newer Janeway), on top of the Pike dataset you also get some Light Novels, Adventure mode support and a little bit of Shinen thrown in the mix. The end result is a very diverse model that is heavily biased towards SFW novel writing, but one that can go beyond its novel training and make for an excellent adventure model to. 
Adventure mode is best played from a second person perspective, but can be played in first or third person as well. Novel writing can be done best from the first or third person. |
-| [Janeway 2.7B](https://huggingface.co/KoboldAI/GPT-Neo-2.7B-Janeway) by Mr Seeker | 2.7B | Novel | Janeway is a model created from Picard's dataset combined with a brand new collection of ebooks. This model is trained on 20% more content than Picard and has been trained on literature from various genres. Although the model is mainly focussed on SFW, romantic scenes might involve a degree of nudity. |
-| [Picard 2.7B](https://huggingface.co/KoboldAI/GPT-Neo-2.7B-Picard) by Mr Seeker | 2.7B | Novel | Picard is a model trained for SFW Novels based on Neo 2.7B. It is focused on Novel style writing without the NSFW bias. While the name suggests a sci-fi model this model is designed for Novels of a variety of genre's. It is meant to be used in KoboldAI's regular mode. |
-| [AID 2.7B](https://huggingface.co/KoboldAI/GPT-Neo-2.7B-AID) by melastacho | 2.7B | Adventure | Also know as Adventure 2.7B this is a clone of the AI Dungeon Classic model and is best known for the epic wackey adventures that AI Dungeon Classic players love. |
-| [Horni LN 2.7B](https://huggingface.co/KoboldAI/GPT-Neo-2.7B-Horni-LN) by finetune | 2.7B | Novel | This model is based on Horni 2.7B and retains its NSFW knowledge, but was then further biased towards SFW novel stories. If you seek a balance between a SFW Novel model and a NSFW model this model should be a good choice. |
-| [Horni 2.7B](https://huggingface.co/KoboldAI/GPT-Neo-2.7B-Horni) by finetune | 2.7B | NSFW | This model is tuned on Literotica to produce a Novel style model biased towards NSFW content. Can still be used for SFW stories but will have a bias towards NSFW content. It is meant to be used in KoboldAI's regular mode. |
-| [Shinen 2.7B ](https://huggingface.co/KoboldAI/GPT-Neo-2.7B-Shinen) by Mr Seeker | 2.7B | NSFW | Shinen is an alternative to the Horni model designed to be more explicit. If Horni is to tame for you Shinen might produce better results. While it is a Novel model it is unsuitable for SFW stories due to its heavy NSFW bias. Shinen will not hold back. It is meant to be used in KoboldAI's regular mode. |
-| [Neo 2.7B](https://huggingface.co/EleutherAI/gpt-neo-2.7B) by EleutherAI | 2.7B | Generic | This is the base model for all the other 2.7B models, it is best used when you have a use case that we have no other models available for, such as writing blog articles or programming. It can also be a good basis for the experience of some of the softprompts if your softprompt is not about a subject the other models cover. |
+| Model | Style | Description |
+| --- | --- | --- |
+| [Nerys](https://huggingface.co/KoboldAI/fairseq-dense-2.7B-Nerys) by Mr Seeker | Novel/Adventure | Nerys is a hybrid model based on Pike (a newer Janeway); on top of the Pike dataset you also get some Light Novels, Adventure mode support and a little bit of Shinen thrown in the mix. The end result is a very diverse model that is heavily biased towards SFW novel writing, but one that can go beyond its novel training and make for an excellent adventure model too. Adventure mode is best played from a second person perspective, but can be played in first or third person as well. Novel writing can be done best from the first or third person. |
+| [Erebus](https://huggingface.co/KoboldAI/OPT-2.7B-Erebus) by Mr Seeker | NSFW | Erebus is our community's flagship NSFW model. Being a combination of multiple large datasets that include Literotica, Shinen and erotic novels from Nerys, and featuring thorough tagging support, it covers the vast majority of erotic writing styles. This model is capable of replacing both the Lit and Shinen models in terms of content and style and has been well received as (one of) the best NSFW models out there. If you wish to use this model for commercial or non-research usage we recommend choosing the 20B version, as that one is not subject to the restrictive OPT license. |
+| [Janeway](https://huggingface.co/KoboldAI/GPT-Neo-2.7B-Janeway) by Mr Seeker | Novel | Janeway is a model created from Picard's dataset combined with a brand new collection of ebooks. This model is trained on 20% more content than Picard and has been trained on literature from various genres. Although the model is mainly focused on SFW, romantic scenes might involve a degree of nudity. |
+| [Picard](https://huggingface.co/KoboldAI/GPT-Neo-2.7B-Picard) by Mr Seeker | Novel | Picard is a model trained for SFW Novels based on Neo 2.7B. It is focused on Novel style writing without the NSFW bias. While the name suggests a sci-fi model, this model is designed for novels of a variety of genres. It is meant to be used in KoboldAI's regular mode. |
+| [AID](https://huggingface.co/KoboldAI/GPT-Neo-2.7B-AID) by melastacho | Adventure | Also known as Adventure 2.7B, this is a clone of the AI Dungeon Classic model and is best known for the epic wacky adventures that AI Dungeon Classic players love. |
+| [Horni LN](https://huggingface.co/KoboldAI/GPT-Neo-2.7B-Horni-LN) by finetune | Novel | This model is based on Horni 2.7B and retains its NSFW knowledge, but was then further biased towards SFW novel stories. If you seek a balance between an SFW novel model and an NSFW model, this model should be a good choice. |
+| [Horni](https://huggingface.co/KoboldAI/GPT-Neo-2.7B-Horni) by finetune | NSFW | This model is tuned on Literotica to produce a Novel style model biased towards NSFW content. It can still be used for SFW stories, but will have a bias towards NSFW content. It is meant to be used in KoboldAI's regular mode. |
+| [Shinen](https://huggingface.co/KoboldAI/GPT-Neo-2.7B-Shinen) by Mr Seeker | NSFW | Shinen is an alternative to the Horni model designed to be more explicit. If Horni is too tame for you, Shinen might produce better results. While it is a novel model, it is unsuitable for SFW stories due to its heavy NSFW bias. Shinen will not hold back. It is meant to be used in KoboldAI's regular mode. |
+| [OPT](https://huggingface.co/facebook/opt-2.7b) by Metaseq | Generic | OPT is considered one of the best base models as far as content goes; its behavior has the strengths of both GPT-Neo and Fairseq Dense. Compared to Neo, duplicate and unnecessary content has been left out, while additional literature was added in, similar to the Fairseq Dense model. The Fairseq Dense model however lacks the broader data that OPT does have. The biggest downfall of OPT is its license, which prohibits any commercial usage, or usage beyond research purposes. |
+| [Fairseq Dense](https://huggingface.co/KoboldAI/fairseq-dense-2.7B) | Generic | Trained by Facebook researchers, this model stems from the MOE research project within Fairseq. This particular version has been converted by us for use in KoboldAI.
It is known to be on par with the larger models from EleutherAI and considered better for pop culture and language tasks. Because the model has never seen a new line (enter), it may perform worse on formatting and paragraphing. Compared to other models, the dataset focuses primarily on literature and contains little else. |
+| [Neo](https://huggingface.co/EleutherAI/gpt-neo-2.7B) by EleutherAI | Generic | This is the base model for all the other 2.7B models; it is best used when you have a use case that we have no other models available for, such as writing blog articles or programming. It can also be a good basis for the experience of some of the softprompts if your softprompt is not about a subject the other models cover. |
### Styles
@@ -192,14 +197,21 @@ Lastly all the features of our userscript API are documented inside the API
For our TPU versions keep in mind that scripts modifying AI behavior rely on a different way of processing that is slower than if you leave these userscripts disabled, even if your script only sporadically uses this modifier. If you want to partially use a script at its full speed then you can enable "No Gen Modifiers" to ensure that the parts that would make the TPU slow are not active.
+## API
+
+KoboldAI has a REST API that can be accessed by adding /api to the URL that Kobold provides you (for example http://127.0.0.1:5000/api).
+When accessing this link in a browser you will be taken to the interactive documentation; a minimal Python usage sketch is also included at the end of this document.
+
## Contributors
This project contains work from the following contributors:
* The Gantian - Creator of KoboldAI, has created most features such as the interface, the different AI model / API integrations and in general the largest part of the project.
-* VE FORBRYDERNE - Contributed many features such as the Editing overhaul, Adventure Mode, expansions to the world info section, breakmodel integration, scripting support, softpromtps and much more. As well as vastly improving the TPU compatibility and integrating external code into KoboldAI so we could use official versions of Transformers with virtually no downsides.
+* VE FORBRYDERNE - Contributed many features such as the Editing overhaul, Adventure Mode, expansions to the world info section, breakmodel integration, scripting support, API, softprompts and much more, as well as vastly improving the TPU compatibility and integrating external code into KoboldAI so we could use official versions of Transformers with virtually no downsides.
* Henk717 - Contributed the installation scripts, this readme, random story generator, the docker scripts, the foundation for the commandline interface and other smaller changes as well as integrating multiple parts of the code of different forks to unite it all. He also optimized the model loading so that downloaded models get converted to efficient offline models and that in future models are more likely to work out of the box. Not all code Github attributes to Henk717 is by Henk717 as some of it has been integrations of other people's work. We try to clarify this in the contributors list as much as we can.
* Ebolam - Automatic Saving, back/redo, pinning, web loading of models
+* one-some - Logits Viewer and Token Streaming
+* db0 - KoboldAI Horde
* Frogging101 - top\_k / tfs support (Part of this support was later redone by VE to integrate what was originally inside of finetuneanon's transformers)
* UWUplus (Ralf) - Contributed storage systems for community colabs, as well as cleaning up and integrating the website dependencies/code better.
He is also the maintainer of flask-cloudflared which we use to generate the cloudflare links. * Javalar - Initial Performance increases on the story\_refresh @@ -216,4 +228,4 @@ Did we miss your contribution? Feel free to issue a commit adding your name to t KoboldAI is licensed with a AGPL license, in short this means that it can be used by anyone for any purpose. However, if you decide to make a publicly available instance your users are entitled to a copy of the source code including all modifications that you have made (which needs to be available trough an interface such as a button on your website), you may also not distribute this project in a form that does not contain the source code (Such as compiling / encrypting the code and distributing this version without also distributing the source code that includes the changes that you made. You are allowed to distribute this in a closed form if you also provide a separate archive with the source code.). -umamba.exe is bundled for convenience because we observed that many of our users had trouble with command line download methods, it is not part of our project and does not fall under the AGPL license. It is licensed under the BSD-3-Clause license. Other files with differing licenses will have a reference or embedded version of this license within the file. +umamba.exe is bundled for convenience because we observed that many of our users had trouble with command line download methods, it is not part of our project and does not fall under the AGPL license. It is licensed under the BSD-3-Clause license. Other files with differing licenses will have a reference or embedded version of this license within the file. It has been sourced from https://anaconda.org/conda-forge/micromamba/files and its source code can be found here : https://github.com/mamba-org/mamba/tree/master/micromamba \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5dd27bce..b1e2247c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ transformers>=4.20.1 Flask Flask-SocketIO requests -torch==1.11 +torch >= 1.9, < 1.13 flask-cloudflared flask-ngrok eventlet @@ -15,3 +15,4 @@ accelerate flask-session marshmallow>=3.13 apispec-webframeworks +loguru diff --git a/requirements_mtj.txt b/requirements_mtj.txt index 67ff0e25..743c9c1d 100644 --- a/requirements_mtj.txt +++ b/requirements_mtj.txt @@ -1,12 +1,11 @@ -torch >= 1.9, <= 1.11 +torch >= 1.9, < 1.13 numpy tqdm requests -optax >= 0.0.5, <= 0.0.9 dm-haiku == 0.0.5 jax == 0.2.21 jaxlib >= 0.1.69, <= 0.3.7 -transformers >= 4.19 +transformers >= 4.20.1 progressbar2 git+https://github.com/VE-FORBRYDERNE/mesh-transformer-jax@ck flask @@ -20,3 +19,4 @@ bleach==4.1.0 flask-session marshmallow>=3.13 apispec-webframeworks +loguru diff --git a/static/application.js b/static/application.js index 9107e161..6cdb531b 100644 --- a/static/application.js +++ b/static/application.js @@ -107,6 +107,9 @@ var modelname = null; var model = ""; var ignore_stream = false; +//timer for loading CLUSTER models +var online_model_timmer; + // This is true iff [we're in macOS and the browser is Safari] or [we're in iOS] var using_webkit_patch = true; @@ -1003,7 +1006,7 @@ function hideSaveAsPopup() { } function sendSaveAsRequest() { - socket.send({'cmd': 'saveasrequest', 'data': {"name": saveasinput.val(), "pins": savepins.val()}}); + socket.send({'cmd': 'saveasrequest', 'data': {"name": saveasinput.val(), "pins": savepins.prop('checked')}}); } function showLoadModelPopup() { @@ -1643,26 +1646,29 @@ 
function chunkOnBeforeInput(event) { if(buildChunkSetFromNodeArray(getSelectedNodes()).size === 0) { var s = rangy.getSelection(); var r = s.getRangeAt(0); + var rand = Math.random(); if(document.queryCommandSupported && document.execCommand && document.queryCommandSupported('insertHTML')) { - document.execCommand('insertHTML', false, '|'); + document.execCommand('insertHTML', false, '|'); } else { var t = document.createTextNode('|'); var b = document.createElement('span'); - b.id = "_EDITOR_SENTINEL_"; + b.id = "_EDITOR_SENTINEL_" + rand + "_"; b.insertNode(t); r.insertNode(b); } - var sentinel = document.getElementById("_EDITOR_SENTINEL_"); - if(sentinel.nextSibling && sentinel.nextSibling.tagName === "CHUNK") { - r.selectNodeContents(sentinel.nextSibling); - r.collapse(true); - } else if(sentinel.previousSibling && sentinel.previousSibling.tagName === "CHUNK") { - r.selectNodeContents(sentinel.previousSibling); - r.collapse(false); - } - s.removeAllRanges(); - s.addRange(r); - sentinel.parentNode.removeChild(sentinel); + setTimeout(function() { + var sentinel = document.getElementById("_EDITOR_SENTINEL_" + rand + "_"); + if(sentinel.nextSibling && sentinel.nextSibling.tagName === "CHUNK") { + r.selectNodeContents(sentinel.nextSibling); + r.collapse(true); + } else if(sentinel.previousSibling && sentinel.previousSibling.tagName === "CHUNK") { + r.selectNodeContents(sentinel.previousSibling); + r.collapse(false); + } + s.removeAllRanges(); + s.addRange(r); + sentinel.parentNode.removeChild(sentinel); + }, 1); } } @@ -2708,6 +2714,9 @@ $(document).ready(function(){ } else if(msg.cmd == "updateoutputstreaming") { // Update toggle state $("#setoutputstreaming").prop('checked', msg.data).change(); + } else if(msg.cmd == "updateshowbudget") { + // Update toggle state + $("#setshowbudget").prop('checked', msg.data).change(); } else if(msg.cmd == "updateshowprobs") { $("#setshowprobs").prop('checked', msg.data).change(); @@ -2847,17 +2856,17 @@ $(document).ready(function(){ chat_name.val(msg.data); } else if(msg.cmd == "setlabelnumseq") { // Update setting label with value from server - $("#setnumseqcur").html(msg.data); + $("#setnumseqcur").val(msg.data); } else if(msg.cmd == "updatenumseq") { // Send current max tokens value to input - $("#setnumseqcur").html(msg.data); + $("#setnumseqcur").val(msg.data); $("#setnumseq").val(parseInt(msg.data)).trigger("change"); } else if(msg.cmd == "setlabelwidepth") { // Update setting label with value from server - $("#setwidepthcur").html(msg.data); + $("#setwidepthcur").val(msg.data); } else if(msg.cmd == "updatewidepth") { // Send current max tokens value to input - $("#setwidepthcur").html(msg.data); + $("#setwidepthcur").val(msg.data); $("#setwidepth").val(parseInt(msg.data)).trigger("change"); } else if(msg.cmd == "updateuseprompt") { // Update toggle state @@ -2912,20 +2921,48 @@ $(document).ready(function(){ $("#oaimodel").addClass("hidden") buildLoadModelList(msg.data, msg.menu, msg.breadcrumbs, msg.showdelete); } else if(msg.cmd == 'selected_model_info') { + console.log(msg); enableButtons([load_model_accept]); $("#oaimodel").addClass("hidden") $("#oaimodel")[0].options[0].selected = true; if (msg.key) { $("#modelkey").removeClass("hidden"); $("#modelkey")[0].value = msg.key_value; + if (msg.models_on_url) { + $("#modelkey")[0].oninput = function() {clearTimeout(online_model_timmer); + online_model_timmer = setTimeout(function() { + socket.send({'cmd': 'Cluster_Key_Update', 'key': document.getElementById("modelkey").value, + 'url': 
document.getElementById("modelurl").value}); + }, 1000); + } + $("#modelkey")[0].onblur = function () {socket.send({'cmd': 'Cluster_Key_Update', 'key': this.value, 'url': document.getElementById("modelurl").value});}; + $("#modelurl")[0].onblur = function () {socket.send({'cmd': 'Cluster_Key_Update', 'key': document.getElementById("modelkey").value, 'url': this.value});}; + } else { + $("#modelkey")[0].onblur = function () {socket.send({'cmd': 'OAI_Key_Update', 'key': $('#modelkey')[0].value});}; + $("#modelurl")[0].onblur = null; + } //if we're in the API list, disable to load button until the model is selected (after the API Key is entered) disableButtons([load_model_accept]); } else { $("#modelkey").addClass("hidden"); - } + + console.log(msg.multi_online_models); + if (msg.multi_online_models) { + $("#oaimodel")[0].setAttribute("multiple", ""); + $("#oaimodel")[0].options[0].textContent = "All" + } else { + $("#oaimodel")[0].removeAttribute("multiple"); + $("#oaimodel")[0].options[0].textContent = "Select Model(s)" + } + + + if (msg.url) { $("#modelurl").removeClass("hidden"); + if (msg.default_url != null) { + document.getElementById("modelurl").value = msg.default_url; + } } else { $("#modelurl").addClass("hidden"); } @@ -3286,7 +3323,11 @@ $(document).ready(function(){ } } var disk_layers = $("#disk_layers").length > 0 ? $("#disk_layers")[0].value : 0; - message = {'cmd': 'load_model', 'use_gpu': $('#use_gpu')[0].checked, 'key': $('#modelkey')[0].value, 'gpu_layers': gpu_layers.slice(0, -1), 'disk_layers': disk_layers, 'url': $('#modelurl')[0].value, 'online_model': $('#oaimodel')[0].value}; + models = getSelectedOptions(document.getElementById('oaimodel')); + if (models.length == 1) { + models = models[0]; + } + message = {'cmd': 'load_model', 'use_gpu': $('#use_gpu')[0].checked, 'key': $('#modelkey')[0].value, 'gpu_layers': gpu_layers.slice(0, -1), 'disk_layers': disk_layers, 'url': $('#modelurl')[0].value, 'online_model': models}; socket.send(message); loadmodelcontent.html(""); hideLoadModelPopup(); @@ -3732,3 +3773,27 @@ function upload_file(file_box) { } } +function getSelectedOptions(element) { + // validate element + if(!element || !element.options) + return []; //or null? + + // return HTML5 implementation of selectedOptions instead. 
+ if (element.selectedOptions) { + selectedOptions = element.selectedOptions; + } else { + // you are here because your browser doesn't have the HTML5 selectedOptions + var opts = element.options; + var selectedOptions = []; + for(var i = 0; i < opts.length; i++) { + if(opts[i].selected) { + selectedOptions.push(opts[i]); + } + } + } + output = [] + for (item of selectedOptions) { + output.push(item.value); + } + return output; +} \ No newline at end of file diff --git a/static/flask_web_gui.js b/static/flask_web_gui.js deleted file mode 100644 index 8571d84f..00000000 --- a/static/flask_web_gui.js +++ /dev/null @@ -1,20 +0,0 @@ -async function getRequest(url='') { - const response = await fetch(url, { - method: 'GET', - cache: 'no-cache' - }) -} - -document.addEventListener('DOMContentLoaded', function() { - -let url = document.location -let route = "/flaskwebgui-keep-server-alive"; -let interval_request = 3 * 1000; //sec - -function keep_alive_server(){ - getRequest(url + route); -} - -setInterval(keep_alive_server, interval_request); - -}) \ No newline at end of file diff --git a/templates/index.html b/templates/index.html index 27b50b78..253f6b8a 100644 --- a/templates/index.html +++ b/templates/index.html @@ -18,11 +18,8 @@ - + - {% if flaskwebgui %} - - {% endif %} @@ -295,12 +292,12 @@
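As a rough illustration of the REST API described in the README section above, here is a minimal sketch of querying a local instance from Python. The endpoint path (`/api/v1/generate`), the payload fields and the response shape are assumptions made for the sake of the example; the authoritative schema is the interactive documentation served at `/api`.

```python
# Minimal sketch of calling the KoboldAI REST API from Python.
# Assumptions (verify against the interactive docs at http://127.0.0.1:5000/api):
#   * the server is running locally on port 5000
#   * POST /api/v1/generate accepts {"prompt": ..., "max_length": ...}
#   * the response looks like {"results": [{"text": ...}, ...]}
import requests

BASE_URL = "http://127.0.0.1:5000/api/v1"

def generate(prompt, max_length=80):
    """Send a prompt to a local KoboldAI instance and return the generated text."""
    payload = {"prompt": prompt, "max_length": max_length}
    response = requests.post(f"{BASE_URL}/generate", json=payload, timeout=300)
    response.raise_for_status()
    data = response.json()
    # Concatenate the text of every returned sequence.
    return "".join(result["text"] for result in data.get("results", []))

if __name__ == "__main__":
    print(generate("The old lighthouse keeper climbed the stairs and"))
```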