Merge branch 'united' into merge/united-exllama

This commit is contained in:
Llama
2023-08-28 09:32:19 -07:00
28 changed files with 1799 additions and 894 deletions

View File

@@ -2,31 +2,13 @@
(This guide is for both Linux and Windows and assumes the user has git installed and a basic grasp of command-line use)
#### Installation
In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created.
For Nvidia users everything is installed automatically when you install the requirements; you merely need a compatible GPTQ model for it to show up.
Note: do not run your command prompt as administrator/with elevated privileges; reports suggest this leads to problems.
`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules`
`cd KoboldAI`
Next step (Windows): subfolder mode or the B: option, it doesn't matter which, choose either
* [if on Windows]
```
install_requirements.bat
```
* If it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory.
* [if on Linux with Nvidia]
```
./install_requirements.sh
```
* [if on Linux with AMD]
```
./install_requirements.sh rocm
./commandline-rocm.sh
pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa@c884b421a233f9603d8224c9b22c2d83dd2c1fc4
pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa
```
* If you get an error about a missing hip/hip_runtime_xxx.h, you don't have the proper ROCm & HIP packages installed
* If you get a "CUDA_HOME environment variable is not set" error, run in the environment:
@@ -46,5 +28,5 @@ If you haven't done so already, exit the command prompt/leave KAI's conda env. (
Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD]
Switch to UI2, then load your model.
Load your model using Huggingface GPTQ as the backend option (This will show up when a valid GPTQ model is detected).

View File

@@ -12,6 +12,8 @@ import random
import shutil
import eventlet
from modeling.inference_model import GenerationMode
eventlet.monkey_patch(all=True, thread=False, os=False)
import os, inspect, contextlib, pickle
os.system("")
@@ -71,6 +73,12 @@ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForToken
import transformers
import ipaddress
from functools import wraps
from modeling.pickling import RestrictedUnpickler, use_custom_unpickler
# Make settings folder early so we can depend on it anywhere
if not os.path.exists("settings/"):
os.mkdir("settings")
try:
from transformers.models.opt.modeling_opt import OPTDecoder
except:
@@ -630,7 +638,10 @@ model_backends = {}
model_backend_module_names = {}
model_backend_type_crosswalk = {}
PRIORITIZED_BACKEND_MODULES = ["generic_hf_torch"]
PRIORITIZED_BACKEND_MODULES = {
"gptq_hf_torch": 2,
"generic_hf_torch": 1
}
for module in os.listdir("./modeling/inference_models"):
if module == '__pycache__':
@@ -666,10 +677,15 @@ for module in os.listdir("./modeling/inference_models"):
model_backend_module_names[backend_name] = module
if backend_type in model_backend_type_crosswalk:
if module in PRIORITIZED_BACKEND_MODULES:
model_backend_type_crosswalk[backend_type].insert(0, backend_name)
else:
model_backend_type_crosswalk[backend_type].append(backend_name)
model_backend_type_crosswalk[backend_type].append(backend_name)
model_backend_type_crosswalk[backend_type] = list(sorted(
model_backend_type_crosswalk[backend_type],
key=lambda name: PRIORITIZED_BACKEND_MODULES.get(
[mod for b_name, mod in model_backend_module_names.items() if b_name == name][0],
0
),
reverse=True
))
else:
model_backend_type_crosswalk[backend_type] = [backend_name]
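For illustration, a small standalone sketch (not part of the commit) of how the new priority mapping orders backends within a crosswalk entry; the backend and module names below are placeholders:
```python
PRIORITIZED_BACKEND_MODULES = {"gptq_hf_torch": 2, "generic_hf_torch": 1}
module_of = {                                   # backend name -> module name (placeholder data)
    "Huggingface GPTQ": "gptq_hf_torch",
    "Huggingface": "generic_hf_torch",
    "Read Only": "readonly",
}
backends = ["Read Only", "Huggingface", "Huggingface GPTQ"]
backends = sorted(backends, key=lambda n: PRIORITIZED_BACKEND_MODULES.get(module_of[n], 0), reverse=True)
print(backends)  # ['Huggingface GPTQ', 'Huggingface', 'Read Only']
```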
@@ -892,7 +908,7 @@ tags = [
api_version = None # This gets set automatically so don't change this value
api_v1 = KoboldAPISpec(
version="1.2.2",
version="1.2.3",
prefixes=["/api/v1", "/api/latest"],
tags=tags,
)
@@ -1670,75 +1686,7 @@ def unload_model():
#Reload our badwords
koboldai_vars.badwordsids = koboldai_settings.badwordsids_default
class RestrictedUnpickler(pickle.Unpickler):
def original_persistent_load(self, saved_id):
return super().persistent_load(saved_id)
def forced_persistent_load(self, saved_id):
if saved_id[0] != "storage":
raise pickle.UnpicklingError("`saved_id[0]` must be 'storage'")
return self.original_persistent_load(saved_id)
def find_class(self, module, name):
if module == "collections" and name == "OrderedDict":
return collections.OrderedDict
elif module == "torch._utils" and name == "_rebuild_tensor_v2":
return torch._utils._rebuild_tensor_v2
elif module == "torch._tensor" and name == "_rebuild_from_type_v2":
return torch._tensor._rebuild_from_type_v2
elif module == "torch" and name in (
"DoubleStorage",
"FloatStorage",
"HalfStorage",
"LongStorage",
"IntStorage",
"ShortStorage",
"CharStorage",
"ByteStorage",
"BoolStorage",
"BFloat16Storage",
"Tensor",
):
return getattr(torch, name)
elif module == "numpy.core.multiarray" and name == "scalar":
return np.core.multiarray.scalar
elif module == "numpy" and name == "dtype":
return np.dtype
elif module == "_codecs" and name == "encode":
return _codecs.encode
else:
# Forbid everything else.
qualified_name = name if module == "__builtin__" else f"{module}.{name}"
raise pickle.UnpicklingError(
f"`{qualified_name}` is forbidden; the model you are loading probably contains malicious code. If you think this is incorrect ask the developer to unban the ability for {module} to execute {name}"
)
def load(self, *args, **kwargs):
self.original_persistent_load = getattr(
self, "persistent_load", pickle.Unpickler.persistent_load
)
self.persistent_load = self.forced_persistent_load
return super().load(*args, **kwargs)
@contextlib.contextmanager
def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler):
try:
old_unpickler = pickle.Unpickler
pickle.Unpickler = unpickler
old_pickle_load = pickle.load
def new_pickle_load(*args, **kwargs):
return pickle.Unpickler(*args, **kwargs).load()
pickle.load = new_pickle_load
yield
finally:
pickle.Unpickler = old_unpickler
pickle.load = old_pickle_load
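The block above is removed here because the unpickler now lives in modeling.pickling (see the new import earlier in this file). A minimal usage sketch, with a hypothetical checkpoint path, of how the context manager guards a pickle load:
```python
import pickle
from modeling.pickling import RestrictedUnpickler, use_custom_unpickler

with use_custom_unpickler(RestrictedUnpickler):
    with open("model.ckpt", "rb") as f:   # hypothetical checkpoint path
        state = pickle.load(f)            # forbidden globals raise pickle.UnpicklingError
```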
def load_model(model_backend, initial_load=False):
global model
global tokenizer
@@ -1747,9 +1695,6 @@ def load_model(model_backend, initial_load=False):
koboldai_vars.aibusy = True
koboldai_vars.horde_share = False
if initial_load:
use_breakmodel_args = True
koboldai_vars.reset_model()
koboldai_vars.noai = False
@@ -1788,7 +1733,9 @@ def load_model(model_backend, initial_load=False):
with use_custom_unpickler(RestrictedUnpickler):
model = model_backends[model_backend]
koboldai_vars.supported_gen_modes = [x.value for x in model.get_supported_gen_modes()]
model.load(initial_load=initial_load, save_model=not (args.colab or args.cacheonly) or args.savemodel)
koboldai_vars.model = model.model_name if "model_name" in vars(model) else model.id #Should have model_name, but it could be set to id depending on how it's setup
if koboldai_vars.model in ("NeoCustom", "GPT2Custom", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"):
koboldai_vars.model = os.path.basename(os.path.normpath(model.path))
@@ -1889,8 +1836,8 @@ def load_model(model_backend, initial_load=False):
os.mkdir("./softprompts")
koboldai_vars.splist = [[f, get_softprompt_desc(os.path.join("./softprompts", f),None,True)] for f in os.listdir("./softprompts") if os.path.isfile(os.path.join("./softprompts", f)) and valid_softprompt(os.path.join("./softprompts", f))]
if initial_load and koboldai_vars.cloudflare_link != "":
logger.message(f"KoboldAI has finished loading and is available at the following link for UI 1: {koboldai_vars.cloudflare_link}")
logger.message(f"KoboldAI has finished loading and is available at the following link for UI 2: {koboldai_vars.cloudflare_link}/new_ui")
logger.message(f"KoboldAI has finished loading and is available at the following link: {koboldai_vars.cloudflare_link}")
logger.message(f"KoboldAI has finished loading and is available at the following link for the Classic UI: {koboldai_vars.cloudflare_link}/classic")
logger.message(f"KoboldAI has finished loading and is available at the following link for KoboldAI Lite: {koboldai_vars.cloudflare_link}/lite")
logger.message(f"KoboldAI has finished loading and is available at the following link for the API: {koboldai_vars.cloudflare_link}/api")
@@ -1922,8 +1869,7 @@ def require_allowed_ip(func):
# Set up Flask routes
@app.route('/')
@app.route('/index')
@app.route('/classic')
@require_allowed_ip
def index():
if args.no_ui:
@@ -3267,11 +3213,20 @@ def check_for_backend_compilation():
break
koboldai_vars.checking = False
def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False, disable_recentrng=False, no_generate=False, ignore_aibusy=False):
def actionsubmit(
data,
actionmode=0,
force_submit=False,
force_prompt_gen=False,
disable_recentrng=False,
no_generate=False,
ignore_aibusy=False,
gen_mode=GenerationMode.STANDARD
):
# Ignore new submissions if the AI is currently busy
if(koboldai_vars.aibusy):
if koboldai_vars.aibusy and not ignore_aibusy:
return
while(True):
set_aibusy(1)
koboldai_vars.actions.clear_unused_options()
@@ -3359,7 +3314,7 @@ def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False,
koboldai_vars.prompt = data
# Clear the startup text from game screen
emit('from_server', {'cmd': 'updatescreen', 'gamestarted': False, 'data': 'Please wait, generating story...'}, broadcast=True, room="UI_1")
calcsubmit("") # Run the first action through the generator
calcsubmit("", gen_mode=gen_mode) # Run the first action through the generator
if(not koboldai_vars.abort and koboldai_vars.lua_koboldbridge.restart_sequence is not None and len(koboldai_vars.genseqs) == 0):
data = ""
force_submit = True
@@ -3425,7 +3380,7 @@ def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False,
if(not no_generate and not koboldai_vars.noai and koboldai_vars.lua_koboldbridge.generating):
# Off to the tokenizer!
calcsubmit("")
calcsubmit("", gen_mode=gen_mode)
if(not koboldai_vars.abort and koboldai_vars.lua_koboldbridge.restart_sequence is not None and len(koboldai_vars.genseqs) == 0):
data = ""
force_submit = True
@@ -3780,7 +3735,7 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None,
#==================================================================#
# Take submitted text and build the text to be given to generator
#==================================================================#
def calcsubmit(txt):
def calcsubmit(txt, gen_mode=GenerationMode.STANDARD):
anotetxt = "" # Placeholder for Author's Note text
forceanote = False # In case we don't have enough actions to hit A.N. depth
anoteadded = False # In case our budget runs out before we hit A.N. depth
@@ -3822,7 +3777,7 @@ def calcsubmit(txt):
logger.debug("Submit: experimental_features time {}s".format(time.time()-start_time))
start_time = time.time()
generate(subtxt, min, max, found_entries)
generate(subtxt, min, max, found_entries, gen_mode=gen_mode)
logger.debug("Submit: generate time {}s".format(time.time()-start_time))
attention_bias.attention_bias = None
@@ -3890,7 +3845,14 @@ class HordeException(Exception):
# Send text to generator and deal with output
#==================================================================#
def generate(txt, minimum, maximum, found_entries=None):
def generate(txt, minimum, maximum, found_entries=None, gen_mode=GenerationMode.STANDARD):
# Open up token stream
emit("stream_tokens", True, broadcast=True, room="UI_2")
# HACK: Show options when streaming more than 1 sequence
if utils.koboldai_vars.output_streaming:
koboldai_vars.actions.show_options(koboldai_vars.numseqs > 1, force=True)
koboldai_vars.generated_tkns = 0
if(found_entries is None):
@@ -3912,7 +3874,7 @@ def generate(txt, minimum, maximum, found_entries=None):
# Submit input text to generator
try:
start_time = time.time()
genout, already_generated = tpool.execute(model.core_generate, txt, found_entries)
genout, already_generated = tpool.execute(model.core_generate, txt, found_entries, gen_mode=gen_mode)
logger.debug("Generate: core_generate time {}s".format(time.time()-start_time))
except Exception as e:
if(issubclass(type(e), lupa.LuaError)):
@@ -3927,7 +3889,10 @@ def generate(txt, minimum, maximum, found_entries=None):
emit('from_server', {'cmd': 'errmsg', 'data': 'Error occurred during generator call; please check console.'}, broadcast=True, room="UI_1")
logger.error(traceback.format_exc().replace("\033", ""))
socketio.emit("error", str(e), broadcast=True, room="UI_2")
set_aibusy(0)
# Clean up token stream
emit("stream_tokens", None, broadcast=True, room="UI_2")
return
for i in range(koboldai_vars.numseqs):
@@ -3959,7 +3924,10 @@ def generate(txt, minimum, maximum, found_entries=None):
del genout
gc.collect()
torch.cuda.empty_cache()
# Clean up token stream
emit("stream_tokens", None, broadcast=True, room="UI_2")
maybe_review_story()
set_aibusy(0)
@@ -4428,8 +4396,8 @@ def requestwi():
# and items in different folders are sorted based on the order of the folders
#==================================================================#
def stablesortwi():
mapping = {uid: index for index, uid in enumerate(koboldai_vars.wifolders_l)}
koboldai_vars.worldinfo.sort(key=lambda x: mapping[str(x["folder"])] if x["folder"] is not None else float("inf"))
mapping = {int(uid): index for index, uid in enumerate(koboldai_vars.wifolders_l)}
koboldai_vars.worldinfo.sort(key=lambda x: mapping[int(x["folder"])] if x["folder"] is not None else float("inf"))
last_folder = ...
last_wi = None
for i, wi in enumerate(koboldai_vars.worldinfo):
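A standalone sketch (with made-up uids and entries) of the folder-order sort above, showing why both the mapping keys and the lookup are normalized to int in this commit:
```python
wifolders_l = [7, 3]                      # folder display order (made-up uids)
worldinfo = [{"folder": 3}, {"folder": None}, {"folder": 7}]
mapping = {int(uid): index for index, uid in enumerate(wifolders_l)}
worldinfo.sort(key=lambda x: mapping[int(x["folder"])] if x["folder"] is not None else float("inf"))
print([w["folder"] for w in worldinfo])   # [7, 3, None]
```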
@@ -5134,9 +5102,13 @@ def load_story_v1(js, from_file=None):
def load_story_v2(js, from_file=None):
logger.debug("Loading V2 Story")
logger.debug("Called from {}".format(inspect.stack()[1].function))
leave_room(session['story'])
session['story'] = js['story_name']
join_room(session['story'])
new_story = js["story_name"]
# In socket context
if hasattr(request, "sid"):
leave_room(session['story'])
join_room(new_story)
session['story'] = new_story
koboldai_vars.load_story(session['story'], js)
@@ -5564,6 +5536,7 @@ def lite_html():
#==================================================================#
# UI V2 CODE
#==================================================================#
@app.route('/')
@app.route('/new_ui')
@require_allowed_ip
@logger.catch
@@ -6149,6 +6122,7 @@ def UI_2_Set_Selected_Text(data):
@socketio.on('Use Option Text')
@logger.catch
def UI_2_Use_Option_Text(data):
koboldai_vars.actions.show_options(False)
if koboldai_vars.prompt == "":
koboldai_vars.prompt = koboldai_vars.actions.get_current_options()[int(data['option'])]['text']
koboldai_vars.actions.clear_unused_options()
@@ -6169,23 +6143,31 @@ def UI_2_delete_option(data):
@socketio.on('submit')
@logger.catch
def UI_2_submit(data):
if not koboldai_vars.noai and data['theme'] != "":
if not koboldai_vars.noai and data['theme']:
# Random prompt generation
logger.debug("doing random prompt")
memory = koboldai_vars.memory
koboldai_vars.memory = "{}\n\nYou generate the following {} story concept :".format(koboldai_vars.memory, data['theme'])
koboldai_vars.lua_koboldbridge.feedback = None
actionsubmit("", force_submit=True, force_prompt_gen=True)
koboldai_vars.memory = memory
else:
logger.debug("doing normal input")
koboldai_vars.actions.clear_unused_options()
koboldai_vars.lua_koboldbridge.feedback = None
koboldai_vars.recentrng = koboldai_vars.recentrngm = None
if koboldai_vars.actions.action_count == -1:
actionsubmit(data['data'], actionmode=koboldai_vars.actionmode)
else:
actionsubmit(data['data'], actionmode=koboldai_vars.actionmode)
return
logger.debug("doing normal input")
koboldai_vars.actions.clear_unused_options()
koboldai_vars.lua_koboldbridge.feedback = None
koboldai_vars.recentrng = koboldai_vars.recentrngm = None
gen_mode_name = data.get("gen_mode", None) or "standard"
try:
gen_mode = GenerationMode(gen_mode_name)
except ValueError:
# Invalid enum lookup!
gen_mode = GenerationMode.STANDARD
logger.warning(f"Unknown gen_mode '{gen_mode_name}', using STANDARD! Report this!")
actionsubmit(data['data'], actionmode=koboldai_vars.actionmode, gen_mode=gen_mode)
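For reference, a minimal sketch of the tolerant gen_mode lookup introduced above; the payload dict is a made-up example of what the UI might send:
```python
from modeling.inference_model import GenerationMode

data = {"data": "You enter the cave.", "gen_mode": "until_sentence_end"}  # hypothetical payload
gen_mode_name = data.get("gen_mode") or "standard"
try:
    gen_mode = GenerationMode(gen_mode_name)
except ValueError:
    gen_mode = GenerationMode.STANDARD  # unknown values fall back to STANDARD
```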
#==================================================================#
# Event triggered when user clicks the submit button
#==================================================================#
@@ -6279,7 +6261,7 @@ def UI_2_select_model(data):
#so we'll just go through all the possible loaders
for model_backend in sorted(
model_backends,
key=lambda x: model_backend_module_names[x] in PRIORITIZED_BACKEND_MODULES,
key=lambda x: PRIORITIZED_BACKEND_MODULES.get(model_backend_module_names[x], 0),
reverse=True,
):
if model_backends[model_backend].is_valid(data["name"], data["path"] if 'path' in data else None, data["menu"]):
@@ -6715,11 +6697,18 @@ def UI_2_set_wi_image(uid):
except FileNotFoundError:
pass
else:
# Otherwise assign image
with open(path, "wb") as file:
file.write(data)
try:
# Otherwise assign image
with open(path, "wb") as file:
file.write(data)
except FileNotFoundError:
show_error_notification(
"Unable to write image",
"Please save the game before uploading images."
)
return ":(", 500
koboldai_vars.gamesaved = False
return ":)"
return ":)", 200
@app.route("/get_wi_image/<int(signed=True):uid>", methods=["GET"])
@require_allowed_ip
@@ -7336,7 +7325,7 @@ def generate_image(prompt: str) -> Optional[Image.Image]:
if koboldai_vars.img_gen_priority == 4:
# Check if stable-diffusion-webui API option selected and use that if found.
return text2img_api(prompt)
elif ((not koboldai_vars.hascuda or not os.path.exists("models/stable-diffusion-v1-4")) and koboldai_vars.img_gen_priority != 0) or koboldai_vars.img_gen_priority == 3:
elif ((not koboldai_vars.hascuda or not os.path.exists("functional_models/stable-diffusion")) and koboldai_vars.img_gen_priority != 0) or koboldai_vars.img_gen_priority == 3:
# If we don't have a GPU, use horde if we're allowed to
return text2img_horde(prompt)
@@ -7362,7 +7351,7 @@ def text2img_local(prompt: str) -> Optional[Image.Image]:
logger.debug("Generating Image")
from diffusers import StableDiffusionPipeline
if koboldai_vars.image_pipeline is None:
pipe = tpool.execute(StableDiffusionPipeline.from_pretrained, "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, cache="functional_models/stable-diffusion").to("cuda")
pipe = tpool.execute(StableDiffusionPipeline.from_pretrained, "XpucT/Deliberate", safety_checker=None, torch_dtype=torch.float16, cache="functional_models/stable-diffusion").to("cuda")
else:
pipe = koboldai_vars.image_pipeline.to("cuda")
logger.debug("time to load: {}".format(time.time() - start_time))
@@ -7784,9 +7773,16 @@ def UI_2_update_tokens(data):
def UI_2_privacy_mode(data):
if data['enabled']:
koboldai_vars.privacy_mode = True
return
if data['password'] == koboldai_vars.privacy_password:
koboldai_vars.privacy_mode = False
else:
if data['password'] == koboldai_vars.privacy_password:
koboldai_vars.privacy_mode = False
logger.warning("Watch out! Someone tried to unlock your instance with an incorrect password! Stay on your toes...")
show_error_notification(
title="Invalid password",
text="The password you provided was incorrect. Please try again."
)
#==================================================================#
# Genres
@@ -8236,6 +8232,7 @@ class WorldInfoUIDsSchema(WorldInfoEntriesUIDsSchema):
class ModelSelectionSchema(KoboldSchema):
model: str = fields.String(required=True, validate=validate.Regexp(r"^(?!\s*NeoCustom)(?!\s*GPT2Custom)(?!\s*TPUMeshTransformerGPTJ)(?!\s*TPUMeshTransformerGPTNeoX)(?!\s*GooseAI)(?!\s*OAI)(?!\s*InferKit)(?!\s*Colab)(?!\s*API).*$"), metadata={"description": 'Hugging Face model ID, the path to a model folder (relative to the "models" folder in the KoboldAI root folder) or "ReadOnly" for no model'})
backend: Optional[str] = fields.String(required=False, validate=validate.OneOf(model_backends.keys()))
def _generate_text(body: GenerationInputSchema):
if koboldai_vars.aibusy or koboldai_vars.genseqs:
@@ -8493,6 +8490,7 @@ def put_model(body: ModelSelectionSchema):
summary: Load a model
description: |-2
Loads a model given its Hugging Face model ID, the path to a model folder (relative to the "models" folder in the KoboldAI root folder) or "ReadOnly" for no model.
Optionally, a backend parameter can be passed in to dictate which backend loads the model.
tags:
- model
requestBody:
@@ -8502,6 +8500,7 @@ def put_model(body: ModelSelectionSchema):
schema: ModelSelectionSchema
example:
model: ReadOnly
backend: Read Only
responses:
200:
description: Successful request
@@ -8519,8 +8518,18 @@ def put_model(body: ModelSelectionSchema):
set_aibusy(1)
old_model = koboldai_vars.model
koboldai_vars.model = body.model.strip()
backend = getattr(body, "backend", None)
if not backend:
# Backend is optional for backwards compatibility; it should probably be
# required on the next major API version.
if body.model == "ReadOnly":
backend = "Read Only"
else:
backend = "Huggingface"
try:
load_model(use_breakmodel_args=True, breakmodel_args_default_to_cpu=True)
load_model(backend)
except Exception as e:
koboldai_vars.model = old_model
raise e
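A hedged usage sketch of the updated endpoint with the new optional backend field; the host and port are assumptions about a local default instance, not part of this commit:
```python
import requests

# PUT /api/v1/model, now accepting an optional "backend" field.
resp = requests.put(
    "http://localhost:5000/api/v1/model",   # assumed local instance
    json={"model": "ReadOnly", "backend": "Read Only"},
)
print(resp.status_code)  # 200 on success
```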
@@ -8808,8 +8817,14 @@ def get_story():
chunks = []
if koboldai_vars.gamestarted:
chunks.append({"num": 0, "text": koboldai_vars.prompt})
for num, action in koboldai_vars.actions.items():
chunks.append({"num": num + 1, "text": action})
last_action_num = list(koboldai_vars.actions.actions.keys())[-1]
for num, action in koboldai_vars.actions.actions.items():
text = action["Selected Text"]
# The last action seems to always be empty
if not text and num == last_action_num:
continue
chunks.append({"num": num + 1, "text": text})
return {"results": chunks}
@@ -8833,7 +8848,7 @@ def get_story_nums():
chunks = []
if koboldai_vars.gamestarted:
chunks.append(0)
for num in koboldai_vars.actions.keys():
for num in koboldai_vars.actions.actions.keys():
chunks.append(num + 1)
return {"results": chunks}
@@ -9194,7 +9209,7 @@ def get_world_info():
if wi["folder"] != last_folder:
folder = []
if wi["folder"] is not None:
folders.append({"uid": wi["folder"], "name": koboldai_vars.wifolders_d[wi["folder"]]["name"], "entries": folder})
folders.append({"uid": wi["folder"], "name": koboldai_vars.wifolders_d[str(wi["folder"])]["name"], "entries": folder})
last_folder = wi["folder"]
(folder if wi["folder"] is not None else entries).append({k: v for k, v in wi.items() if k not in ("init", "folder", "num") and (wi["selective"] or k != "keysecondary")})
return {"folders": folders, "entries": entries}
@@ -10905,8 +10920,8 @@ def run():
if not koboldai_vars.use_colab_tpu and args.model:
# If we're using a TPU our UI will freeze during the connection to the TPU. To prevent this from showing to the user we
# delay the display of this message until after that step
logger.message(f"KoboldAI is still loading your model but available at the following link for UI 1: {cloudflare}")
logger.message(f"KoboldAI is still loading your model but available at the following link for UI 2: {cloudflare}/new_ui")
logger.message(f"KoboldAI is still loading your model but available at the following link: {cloudflare}")
logger.message(f"KoboldAI is still loading your model but available at the following link for the Classic UI: {cloudflare}/classic")
logger.message(f"KoboldAI is still loading your model but available at the following link for KoboldAI Lite: {cloudflare}/lite")
logger.message(f"KoboldAI is still loading your model but available at the following link for the API: [Loading Model...]")
logger.message(f"While the model loads you can use the above links to begin setting up your session, for generations you must wait until after its done loading.")

View File

@@ -80,7 +80,7 @@
"#@title <b><-- Select your model below and then click this to start KoboldAI</b>\n",
"#@markdown You can find a description of the models below along with instructions on how to start KoboldAI.\n",
"\n",
"Model = \"Nerys V2 6B\" #@param [\"Nerys V2 6B\", \"Skein 6B\", \"Janeway 6B\", \"Adventure 6B\", \"Nerys 2.7B\", \"AID 2.7B\", \"Janeway 2.7B\", \"Picard 2.7B\", \"OPT 2.7B\", \"Fairseq Dense 2.7B\", \"Neo 2.7B\"] {allow-input: true}\n",
"Model = \"Nerys V2 6B\" #@param [\"MythoMax 13B (United)\", \"Huginn 13B (United)\", \"Chronos 13B (United)\", \"Airoboros M2.0 13B (United)\", \"Holodeck 13B (United)\", \"Spring Dragon 13B (United)\", \"Nerys V2 6B\", \"Skein 6B\", \"Janeway 6B\", \"Adventure 6B\", \"Nerys 2.7B\", \"AID 2.7B\", \"Janeway 2.7B\", \"Picard 2.7B\", \"OPT 2.7B\", \"Fairseq Dense 2.7B\", \"Neo 2.7B\"] {allow-input: true}\n",
"Version = \"Official\" #@param [\"Official\", \"United\"] {allow-input: true}\n",
"Provider = \"Cloudflare\" #@param [\"Localtunnel\", \"Cloudflare\"]\n",
"use_google_drive = True #@param {type:\"boolean\"}\n",
@@ -146,6 +146,36 @@
" Model = \"EleutherAI/gpt-neo-2.7B\"\n",
" path = \"\"\n",
" download = \"\"\n",
"elif Model == \"Huginn 13B (United)\":\n",
" Model = \"The-Face-Of-Goonery/Huginn-13b-v1.2\"\n",
" path = \"\"\n",
" download = \"\"\n",
" Version = \"United\"\n",
"elif Model == \"Chronos 13B (United)\":\n",
" Model = \"elinas/chronos-13b-v2\"\n",
" path = \"\"\n",
" download = \"\"\n",
" Version = \"United\"\n",
"elif Model == \"Airoboros M2.0 13B (United)\":\n",
" Model = \"jondurbin/airoboros-l2-13b-gpt4-m2.0\"\n",
" path = \"\"\n",
" download = \"\"\n",
" Version = \"United\"\n",
"elif Model == \"MythoMax 13B (United)\":\n",
" Model = \"Gryphe/MythoMax-L2-13b\"\n",
" path = \"\"\n",
" download = \"\"\n",
" Version = \"United\"\n",
"elif Model == \"Spring Dragon 13B (United)\":\n",
" Model = \"Henk717/spring-dragon\"\n",
" path = \"\"\n",
" download = \"\"\n",
" Version = \"United\"\n",
"elif Model == \"Holodeck 13B (United)\":\n",
" Model = \"KoboldAI/LLAMA2-13B-Holodeck-1\"\n",
" path = \"\"\n",
" download = \"\"\n",
" Version = \"United\"\n",
"\n",
"if Provider == \"Localtunnel\":\n",
" tunnel = \"--localtunnel yes\"\n",
@@ -193,6 +223,20 @@
"metadata": {
"id": "Lrm840I33hkC"
}
},
{
"cell_type": "code",
"source": [
"#@title <b>Model Cleaner</b>\n",
"#@markdown Out of space? Run this to remove all cached models (Google Drive models are not effected).\n",
"!rm -rf /content/KoboldAI-Client/cache/*\n"
],
"metadata": {
"cellView": "form",
"id": "5k8fK4F6UiTs"
},
"execution_count": null,
"outputs": []
}
]
}

View File

@@ -47,10 +47,10 @@ dependencies:
- pydub
- diffusers
- git+https://github.com/0cc4m/hf_bleeding_edge/
- --find-links=https://0cc4m.github.io/GPTQ-for-LLaMa/gptq-whl-links.html
- gptq_koboldai==0.0.6
- https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
- https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/0.0.6/gptq_koboldai-0.0.6-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
- https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu118-cp38-cp38-linux_x86_64.whl; sys_platform == 'linux'
- https://github.com/henk717/KoboldAI/releases/download/Snapshot-11-08-23/auto_gptq-0.4.1+cu118-cp38-cp38-win_amd64.whl; sys_platform == 'win32'
- einops
- peft==0.3.0
- scipy
- --find-links=https://0cc4m.github.io/exllama/exllama-whl-links.html
- exllama==0.0.6

View File

@@ -7,7 +7,7 @@ gensettingstf = [
"min": 16,
"max": 512,
"step": 2,
"default": 80,
"default": 200,
"tooltip": "Number of tokens to be generated. Higher values will take longer to generate.",
"menu_path": "Settings",
"sub_path": "Generation",
@@ -182,9 +182,9 @@ gensettingstf = [
"label": "Context Tokens",
"id": "settknmax",
"min": 512,
"max": 2048,
"max": 4096,
"step": 8,
"default": 1024,
"default": 2048,
"tooltip": "Number of context tokens to submit to the AI for sampling. Make sure this is higher than Output Length. Higher values increase VRAM/RAM usage.",
"menu_path": "Settings",
"sub_path": "Generation",
@@ -296,7 +296,7 @@ gensettingstf = [
"max": 1,
"step": 1,
"default": 0,
"tooltip": "Scans the AI's output for World Info keys as it is generating the one.",
"tooltip": "Look for World Info keys in the AI's response while it is still being generated.",
"menu_path": "World Info",
"sub_path": "",
"classname": "story",
@@ -413,6 +413,23 @@ gensettingstf = [
,
"ui_level": 2
},
{
"UI_V2_Only": True,
"uitype": "toggle",
"unit": "bool",
"label": "Smooth Streaming",
"id": "smoothstreaming",
"min": 0,
"max": 1,
"step": 1,
"default": 0,
"tooltip": "Makes Token Streaming type in characters, not tokens. Note that this is purely visual, and will likely increase delay in seeing the tokens.",
"menu_path": "Interface",
"sub_path": "UI",
"classname": "user",
"name": "smooth_streaming",
"ui_level": 1
},
{
"uitype": "toggle",
"unit": "bool",
@@ -739,7 +756,7 @@ gensettingstf = [
"max": 1,
"step": 1,
"default": 0,
"tooltip": "If enabled, experimental features will be displayed in the UI.",
"tooltip": "If enabled, experimental features will be displayed in the UI. Note: These features have been determined to be too unstable for standard use, and may corrupt your data. You're on your own from here.",
"menu_path": "Interface",
"sub_path": "UI",
"classname": "system",

View File

@@ -6,7 +6,7 @@ import os, re, time, threading, json, pickle, base64, copy, tqdm, datetime, sys
import shutil
from typing import List, Union
from io import BytesIO
from flask import has_request_context, session
from flask import has_request_context, session, request
from flask_socketio import join_room, leave_room
from collections import OrderedDict
import multiprocessing
@@ -130,11 +130,14 @@ class koboldai_vars(object):
original_story_name = story_name
if not multi_story:
story_name = 'default'
#Leave the old room and join the new one
logger.debug("Leaving room {}".format(session['story']))
leave_room(session['story'])
logger.debug("Joining room {}".format(story_name))
join_room(story_name)
# Leave the old room and join the new one if in socket context
if hasattr(request, "sid"):
logger.debug("Leaving room {}".format(session['story']))
leave_room(session['story'])
logger.debug("Joining room {}".format(story_name))
join_room(story_name)
session['story'] = story_name
logger.debug("Sending story reset")
self._story_settings[story_name]._socketio.emit("reset_story", {}, broadcast=True, room=story_name)
@@ -653,7 +656,7 @@ class model_settings(settings):
'welcome', 'welcome_default', 'simple_randomness', 'simple_creativity', 'simple_repitition',
'badwordsids', 'uid_presets', 'model', 'model_type', 'lazy_load', 'fp32_model', 'modeldim', 'horde_wait_time', 'horde_queue_position', 'horde_queue_size', 'newlinemode', 'tqdm_progress', 'tqdm_rem_time', '_tqdm']
settings_name = "model"
default_settings = {"rep_pen" : 1.1, "rep_pen_slope": 0.7, "rep_pen_range": 1024, "temp": 0.5, "top_p": 0.9, "top_k": 0, "top_a": 0.0, "tfs": 1.0, "typical": 1.0,
default_settings = {"rep_pen" : 1.1, "rep_pen_slope": 1.0, "rep_pen_range": 2048, "temp": 0.5, "top_p": 0.9, "top_k": 0, "top_a": 0.0, "tfs": 1.0, "typical": 1.0,
"sampler_order": [6,0,1,2,3,4,5]}
def __init__(self, socketio, koboldai_vars):
self.enable_whitelist = False
@@ -677,7 +680,7 @@ class model_settings(settings):
<div id='welcome-logo-container'><img id='welcome-logo' src='static/Welcome_Logo.png' draggable='False'></div>
<div class='welcome_text'>
<div id="welcome-text-content">Please load a model from the left.<br/>
If you encounter any issues, please click the Download debug dump link in the Home tab on the left flyout and attach the downloaded file to your error report on <a href='https://github.com/ebolam/KoboldAI/issues'>Github</a>, <a href='https://www.reddit.com/r/KoboldAI/'>Reddit</a>, or <a href='https://discord.gg/XuQWadgU9k'>Discord</a>.
If you encounter any issues, please click the Download debug dump link in the Home tab on the left flyout and attach the downloaded file to your error report on <a href='https://github.com/ebolam/KoboldAI/issues'>Github</a>, <a href='https://www.reddit.com/r/KoboldAI/'>Reddit</a>, or <a href='https://koboldai.org/discord'>Discord</a>.
A redacted version (without story text) is available.
</div>
</div>""" # Custom Welcome Text
@@ -685,18 +688,19 @@ class model_settings(settings):
self._koboldai_vars = koboldai_vars
self.alt_multi_gen = False
self.bit_8_available = None
self.supported_gen_modes = []
def reset_for_model_load(self):
self.simple_randomness = 0 #Set first as this affects other outputs
self.simple_creativity = 0 #Set first as this affects other outputs
self.simple_repitition = 0 #Set first as this affects other outputs
self.max_length = 1024 # Maximum number of tokens to submit per action
self.max_length = 2048 # Maximum number of tokens to submit per action
self.ikmax = 3000 # Maximum number of characters to submit to InferKit
self.genamt = 80 # Amount of text for each action to generate
self.genamt = 200 # Amount of text for each action to generate
self.ikgen = 200 # Number of characters for InferKit to generate
self.rep_pen = 1.1 # Default generator repetition_penalty
self.rep_pen_slope = 0.7 # Default generator repetition penalty slope
self.rep_pen_range = 1024 # Default generator repetition penalty range
self.rep_pen_slope = 1.0 # Default generator repetition penalty slope
self.rep_pen_range = 2048 # Default generator repetition penalty range
self.temp = 0.5 # Default generator temperature
self.top_p = 0.9 # Default generator top_p
self.top_k = 0 # Default generator top_k
@@ -1155,6 +1159,7 @@ class user_settings(settings):
self.nogenmod = False
self.debug = False # If set to true, will send debug information to the client for display
self.output_streaming = True
self.smooth_streaming = True
self.show_probs = False # Whether or not to show token probabilities
self.beep_on_complete = False
self.img_gen_priority = 1
@@ -1755,11 +1760,15 @@ class KoboldStoryRegister(object):
def go_forward(self):
action_step = self.action_count+1
if action_step in self.actions:
if len(self.get_current_options()) == 1:
logger.warning("Going forward with this text: {}".format(self.get_current_options()[0]["text"]))
self.use_option([x['text'] for x in self.actions[action_step]["Options"]].index(self.get_current_options()[0]["text"]))
if action_step not in self.actions:
return
self.show_options(len(self.get_current_options()) > 1)
if len(self.get_current_options()) == 1:
logger.warning("Going forward with this text: {}".format(self.get_current_options()[0]["text"]))
self.use_option([x['text'] for x in self.actions[action_step]["Options"]].index(self.get_current_options()[0]["text"]))
def use_option(self, option_number, action_step=None):
if action_step is None:
action_step = self.action_count+1
@@ -1797,6 +1806,16 @@ class KoboldStoryRegister(object):
process_variable_changes(self._socketio, "story", 'actions', {"id": action_step, 'action': self.actions[action_step]}, None)
self.set_game_saved()
def show_options(
self,
should_show: bool,
force: bool = False,
) -> None:
if self._koboldai_vars.aibusy and not force:
return
self._socketio.emit("show_options", should_show, broadcast=True, room="UI_2")
def delete_action(self, action_id, keep=True):
if action_id in self.actions:
old_options = copy.deepcopy(self.actions[action_id]["Options"])
@@ -1889,34 +1908,19 @@ class KoboldStoryRegister(object):
process_variable_changes(self._socketio, "story", 'actions', {"id": self.action_count+1, 'action': self.actions[self.action_count+1]}, None)
else:
#We're streaming single options so our output is our selected
#First we need to see if this is actually the prompt. If so we'll just not do streaming:
if self.story_settings.prompt != "":
if self.action_count+1 in self.actions:
if self._koboldai_vars.tokenizer is not None:
selected_text_length = len(self._koboldai_vars.tokenizer.encode(self.actions[self.action_count+1]['Selected Text']))
else:
selected_text_length = 0
self.actions[self.action_count+1]['Selected Text'] = "{}{}".format(self.actions[self.action_count+1]['Selected Text'], text_list[0])
self.actions[self.action_count+1]['Selected Text Length'] = selected_text_length
else:
if self._koboldai_vars.tokenizer is not None:
selected_text_length = len(self._koboldai_vars.tokenizer.encode(text_list[0]))
else:
selected_text_length = 0
self.actions[self.action_count+1] = {"Selected Text": text_list[0], "Selected Text Length": selected_text_length, "Options": [], "Time": int(time.time())}
if self._koboldai_vars.tokenizer is not None:
if len(self._koboldai_vars.tokenizer.encode(self.actions[self.action_count+1]['Selected Text'])) != self._koboldai_vars.genamt:
#ui1
if queue is not None:
queue.put(["from_server", {"cmd": "streamtoken", "data": [{
"decoded": text_list[0],
"probabilities": self.probability_buffer
}]}, {"broadcast":True, "room":"UI_1"}])
#process_variable_changes(self._socketio, "actions", "Options", {"id": self.action_count+1, "options": self.actions[self.action_count+1]["Options"]}, {"id": self.action_count+1, "options": None})
process_variable_changes(self._socketio, "story", 'actions', {"id": self.action_count+1, 'action': self.actions[self.action_count+1]}, None)
queue.put(["stream_tokens", text_list, {"broadcast": True, "room": "UI_2"}])
# UI1
queue.put([
"from_server", {
"cmd": "streamtoken",
"data": [{
"decoded": text_list[0],
"probabilities": self.probability_buffer
}],
},
{"broadcast":True, "room": "UI_1"}
])
def set_probabilities(self, probabilities, action_id=None):
self.probability_buffer = probabilities

View File

@@ -3,6 +3,8 @@ from __future__ import annotations
from dataclasses import dataclass
import time
from typing import List, Optional, Union
from enum import Enum
from logger import logger
import torch
@@ -12,6 +14,7 @@ from transformers import (
GPT2Tokenizer,
AutoTokenizer,
)
from modeling.stoppers import Stoppers
from modeling.tokenizer import GenericTokenizer
from modeling import logits_processors
@@ -144,7 +147,10 @@ class GenerationSettings:
class ModelCapabilities:
embedding_manipulation: bool = False
post_token_hooks: bool = False
# Used to gauge if manual stopping is possible
stopper_hooks: bool = False
# TODO: Support non-live probabilities from APIs
post_token_probs: bool = False
@@ -154,6 +160,12 @@ class ModelCapabilities:
# Some models need to warm up the TPU before use
uses_tpu: bool = False
class GenerationMode(Enum):
STANDARD = "standard"
FOREVER = "forever"
UNTIL_EOS = "until_eos"
UNTIL_NEWLINE = "until_newline"
UNTIL_SENTENCE_END = "until_sentence_end"
class InferenceModel:
"""Root class for all models."""
@@ -256,6 +268,7 @@ class InferenceModel:
self,
text: list,
found_entries: set,
gen_mode: GenerationMode = GenerationMode.STANDARD,
):
"""Generate story text. Heavily tied to story-specific parameters; if
you are making a new generation-based feature, consider `generate_raw()`.
@@ -263,6 +276,7 @@ class InferenceModel:
Args:
text (list): Encoded input tokens
found_entries (set): Entries found for Dynamic WI
gen_mode (GenerationMode): The GenerationMode to pass to raw_generate. Defaults to GenerationMode.STANDARD
Raises:
RuntimeError: if inconsistencies are detected between the internal state and Lua state -- sanity check
@@ -358,6 +372,7 @@ class InferenceModel:
seed=utils.koboldai_vars.seed
if utils.koboldai_vars.full_determinism
else None,
gen_mode=gen_mode
)
logger.debug(
"core_generate: run raw_generate pass {} {}s".format(
@@ -532,6 +547,7 @@ class InferenceModel:
found_entries: set = (),
tpu_dynamic_inference: bool = False,
seed: Optional[int] = None,
gen_mode: GenerationMode = GenerationMode.STANDARD,
**kwargs,
) -> GenerationResult:
"""A wrapper around `_raw_generate()` that handles gen_state and other stuff. Use this to generate text outside of the story.
@@ -547,6 +563,7 @@ class InferenceModel:
is_core (bool, optional): Whether this generation is a core story generation. Defaults to False.
single_line (bool, optional): Generate one line only.. Defaults to False.
found_entries (set, optional): Entries found for Dynamic WI. Defaults to ().
gen_mode (GenerationMode): Special generation mode. Defaults to GenerationMode.STANDARD.
Raises:
ValueError: If prompt type is weird
@@ -568,6 +585,29 @@ class InferenceModel:
"wi_scanner_excluded_keys", set()
)
self.gen_state["allow_eos"] = False
temp_stoppers = []
if gen_mode not in self.get_supported_gen_modes():
gen_mode = GenerationMode.STANDARD
logger.warning(f"User requested unsupported GenerationMode '{gen_mode}'!")
if gen_mode == GenerationMode.FOREVER:
self.gen_state["stop_at_genamt"] = False
max_new = 1e7
elif gen_mode == GenerationMode.UNTIL_EOS:
self.gen_state["allow_eos"] = True
self.gen_state["stop_at_genamt"] = False
max_new = 1e7
elif gen_mode == GenerationMode.UNTIL_NEWLINE:
# TODO: Look into replacing `single_line` with `generation_mode`
temp_stoppers.append(Stoppers.newline_stopper)
elif gen_mode == GenerationMode.UNTIL_SENTENCE_END:
temp_stoppers.append(Stoppers.sentence_end_stopper)
self.stopper_hooks += temp_stoppers
utils.koboldai_vars.inference_config.do_core = is_core
gen_settings = GenerationSettings(*(generation_settings or {}))
@@ -597,13 +637,21 @@ class InferenceModel:
)
time_end = round(time.time() - time_start, 2)
tokens_per_second = round(len(result.encoded[0]) / time_end, 2)
try:
tokens_per_second = round(len(result.encoded[0]) / time_end, 2)
except ZeroDivisionError:
# Introducing KoboldAI's fastest model: ReadOnly!
tokens_per_second = 0
if not utils.koboldai_vars.quiet:
logger.info(
f"Generated {len(result.encoded[0])} tokens in {time_end} seconds, for an average rate of {tokens_per_second} tokens per second."
)
for stopper in temp_stoppers:
self.stopper_hooks.remove(stopper)
return result
def generate(
@@ -620,3 +668,19 @@ class InferenceModel:
def _post_token_gen(self, input_ids: torch.LongTensor) -> None:
for hook in self.post_token_hooks:
hook(self, input_ids)
def get_supported_gen_modes(self) -> List[GenerationMode]:
"""Returns a list of compatible `GenerationMode`s for the current model.
Returns:
List[GenerationMode]: A list of compatible `GenerationMode`s.
"""
ret = [GenerationMode.STANDARD]
if self.capabilties.stopper_hooks:
ret += [
GenerationMode.FOREVER,
GenerationMode.UNTIL_NEWLINE,
GenerationMode.UNTIL_SENTENCE_END,
]
return ret
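Illustrative only: a small helper sketch (names are assumptions, not part of the commit) showing how a caller might pick a generation mode the loaded backend can honor via get_supported_gen_modes():
```python
from modeling.inference_model import GenerationMode

def pick_gen_mode(requested, supported):
    # Fall back to STANDARD when the backend cannot honor the requested mode.
    return requested if requested in supported else GenerationMode.STANDARD

# Example: pick_gen_mode(GenerationMode.FOREVER, model.get_supported_gen_modes())
```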

View File

@@ -27,6 +27,10 @@ model_backend_name = "Huggingface"
model_backend_type = "Huggingface" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)
class model_backend(HFTorchInferenceModel):
def __init__(self) -> None:
super().__init__()
self.use_4_bit = False
def is_valid(self, model_name, model_path, menu_path):
base_is_valid = super().is_valid(model_name, model_path, menu_path)
path = False
@@ -58,15 +62,15 @@ class model_backend(HFTorchInferenceModel):
"unit": "text",
"label": "Quantization",
"id": "quantization",
"default": temp['quantization'] if 'quantization' in temp else 'none',
"default": temp['quantization'] if 'quantization' in temp else '4bit' if dependency_exists else '16-bit',
"tooltip": "Whether or not to use BnB's 4-bit or 8-bit mode",
"menu_path": "Layers",
"children": [{'text': 'None', 'value':'none'},{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}],
"children": [{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}, {'text': '16-bit', 'value':'16-bit'}],
"extra_classes": "",
"refresh_model_inputs": False
})
else:
logger.warning("Bitsandbytes is not installed, you can not use Huggingface models in 4-bit")
logger.warning("Bitsandbytes is not installed, you can not use Quantization for Huggingface models")
return requested_parameters
def set_input_parameters(self, parameters):
@@ -124,7 +128,8 @@ class model_backend(HFTorchInferenceModel):
# We must disable low_cpu_mem_usage and if using a GPT-2 model
# because GPT-2 is not compatible with this feature yet.
tf_kwargs.pop("low_cpu_mem_usage", None)
tf_kwargs.pop("quantization_config", None)
# Also, lazy loader doesn't support GPT-2 models
self.lazy_load = False

View File

@@ -7,7 +7,7 @@ import torch
import re
import shutil
import sys
from typing import Union
from typing import Dict, Union
import utils
import modeling.lazy_loader as lazy_loader
@@ -82,13 +82,109 @@ def get_gptq_version(fpath):
logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}")
return 0, False
def load_quant_offload_device_map(
load_quant_func, model, checkpoint, wbits, groupsize, device_map, offload_type=0, force_bias=False,
):
from gptq.offload import (
find_layers,
llama_offload_forward,
gptneox_offload_forward,
gptj_offload_forward,
opt_offload_forward,
bigcode_offload_forward
)
from transformers.models.llama.modeling_llama import LlamaModel
from transformers.models.opt.modeling_opt import OPTModel
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXModel
from transformers.models.gptj.modeling_gptj import GPTJModel
from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeModel
model = load_quant_func(model, checkpoint, wbits, groupsize, force_bias=force_bias)
m, layers, remaining = find_layers(model)
type(m).non_offload_forward = type(m).forward
# Hook offload_forward into found model
if type(m) == LlamaModel:
type(m).forward = llama_offload_forward
elif type(m) == GPTNeoXModel:
type(m).forward = gptneox_offload_forward
elif type(m) == GPTJModel:
type(m).forward = gptj_offload_forward
elif type(m) == OPTModel:
type(m).forward = opt_offload_forward
elif type(m) == GPTBigCodeModel:
type(m).forward = bigcode_offload_forward
else:
raise RuntimeError(f"Model type {type(m)} not supported by CPU offloader")
layers_done = len([1 for v in device_map.values() if v != "cpu"])
m.cpu_device = torch.device("cpu")
m.fast_offload = layers_done > len(layers) // 2
m.layer_count = len(layers)
m.cpu_layers = len(layers) - layers_done
m.gpu_layers = layers_done
m.offload_type = offload_type
# HACK
m.primary_gpu = list(device_map.values())[0]
if "layers" not in dir(m):
m.layers = layers
for i in range(len(layers)):
dev = None
for key, device in device_map.items():
key = int(*[x for x in key.split(".") if x.isdecimal()])
if key == i:
dev = device
break
if dev is None:
raise ValueError
layers[key].to(dev, torch.float16, False)
for module in remaining:
module.to(m.primary_gpu)
return model
class model_backend(HFTorchInferenceModel):
def is_valid(self, model_name, model_path, menu_path):
gptq_model, _, _, _, _ = load_model_gptq_settings(model_path)
return bool(gptq_model)
def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
requested_parameters = super().get_requested_parameters(model_name, model_path, menu_path, parameters)
if model_name != 'customhuggingface' or "custom_model_name" in parameters:
if os.path.exists("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
with open("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
temp = json.load(f)
else:
temp = {}
requested_parameters.append({
"uitype": "dropdown",
"unit": "text",
"label": "Implementation",
"id": "implementation",
"default": temp['implementation'] if 'implementation' in temp else 'occam',
"tooltip": "Which GPTQ provider to use?",
"menu_path": "Layers",
"children": [{'text': 'Occam GPTQ', 'value': 'occam'}, {'text': 'AutoGPTQ', 'value': 'AutoGPTQ'}],
"extra_classes": "",
"refresh_model_inputs": False
})
return requested_parameters
def set_input_parameters(self, parameters):
super().set_input_parameters(parameters)
self.implementation = parameters['implementation'] if 'implementation' in parameters else "occam"
def _load(self, save_model: bool, initial_load: bool) -> None:
try:
from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
from transformers import AutoModelForCausalLM
# Make model path the same as the model name to make this consistent
# with the other loading method if it isn't a known model type. This
# code is not just a workaround for below, it is also used to make the
@@ -98,7 +194,7 @@ class model_backend(HFTorchInferenceModel):
self.init_model_config()
self.lazy_load = False
self.lazy_load = True
gpulayers = self.breakmodel_config.gpu_blocks
@@ -107,10 +203,6 @@ class model_backend(HFTorchInferenceModel):
except (ValueError, AttributeError):
self.gpu_layers_list = [utils.num_layers(self.model_config)]
tf_kwargs = {
"low_cpu_mem_usage": True,
}
# If we're using torch_lazy_loader, we need to get breakmodel config
# early so that it knows where to load the individual model tensors
logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel))
@@ -123,9 +215,6 @@ class model_backend(HFTorchInferenceModel):
self.breakmodel_device_config(self.model_config)
if self.lazy_load:
# torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
tf_kwargs.pop("low_cpu_mem_usage", None)
# If we're using lazy loader, we need to figure out what the model's hidden layers are called
with lazy_loader.use_lazy_load(dematerialized_modules=True):
try:
@@ -141,7 +230,7 @@ class model_backend(HFTorchInferenceModel):
if self.get_local_model_path():
# Model is stored locally, load it.
self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
self.model = self._get_model(self.get_local_model_path())
self.tokenizer = self._get_tokenizer(self.get_local_model_path())
else:
raise NotImplementedError("GPTQ Model downloading not implemented")
@@ -161,7 +250,58 @@ class model_backend(HFTorchInferenceModel):
self.model.kai_model = self
utils.koboldai_vars.modeldim = self.get_hidden_size()
def _get_model(self, location: str, tf_kwargs: Dict):
def _patch_quant(self, device_map, quant_module) -> None:
def make_quant(module, names, bits, groupsize, name='', force_bias=False, **kwargs):
if isinstance(module, quant_module.QuantLinear):
return
for attr in dir(module):
tmp = getattr(module, attr)
name1 = name + '.' + attr if name != '' else attr
if name1 in names:
parts = name1.split(".")
device = None
for i in reversed(range(len(parts))):
maybe_key = ".".join(parts[:i])
if maybe_key in device_map:
device = device_map[maybe_key]
break
if device is None:
raise ValueError(f"No device for {name1}")
delattr(module, attr)
ql = quant_module.QuantLinear(
bits,
groupsize,
tmp.in_features,
tmp.out_features,
force_bias or tmp.bias is not None,
**kwargs,
)
ql = ql.to(device)
setattr(module, attr, ql)
for name1, child in module.named_children():
make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1, force_bias=force_bias)
quant_module.make_quant = make_quant
def _patch_quants(self, device_map) -> None:
# Load QuantLinears on the device corresponding to the device map
from gptq import quant_v3
from gptq import quant_v2
from gptq import quant_v1
for quant_module in [quant_v3, quant_v2, quant_v1]:
self._patch_quant(device_map, quant_module)
def _get_model(self, location: str):
import gptq
from gptq.gptj import load_quant as gptj_load_quant
from gptq.gptneox import load_quant as gptneox_load_quant
@@ -169,7 +309,12 @@ class model_backend(HFTorchInferenceModel):
from gptq.opt import load_quant as opt_load_quant
from gptq.bigcode import load_quant as bigcode_load_quant
from gptq.mpt import load_quant as mpt_load_quant
from gptq.offload import load_quant_offload
try:
import hf_bleeding_edge
from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
from transformers import AutoModelForCausalLM
gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location)
v2_bias = False
@@ -181,50 +326,77 @@ class model_backend(HFTorchInferenceModel):
model_type = self.get_model_type()
logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}")
if model_type == "gptj":
model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "gpt_neox":
model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "llama":
model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "opt":
model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "mpt":
model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
elif model_type == "gpt_bigcode":
model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half()
else:
try:
import auto_gptq
from auto_gptq import AutoGPTQForCausalLM
except ImportError:
raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")
try:
import hf_bleeding_edge
from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
from transformers import AutoModelForCausalLM
device_map = {}
# Monkey patch in hf_bleeding_edge to avoid having to trust remote code
auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM
model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"))
if self.lazy_load:
with lazy_loader.use_lazy_load(dematerialized_modules=True):
metamodel = AutoModelForCausalLM.from_config(self.model_config)
if utils.args.cpu:
device_map = {name: "cpu" for name in utils.layers_module_names}
for name in utils.get_missing_module_names(
metamodel, list(device_map.keys())
):
device_map[name] = "cpu"
else:
device_map = self.breakmodel_config.get_device_map(
metamodel
)
# Patch in embeddings function
def get_input_embeddings(self):
return self.model.get_input_embeddings()
self._patch_quants(device_map)
type(model).get_input_embeddings = get_input_embeddings
with lazy_loader.use_lazy_load(
enable=self.lazy_load,
dematerialized_modules=False,
):
if self.implementation == "occam":
try:
if model_type == "gptj":
model = load_quant_offload_device_map(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "gpt_neox":
model = load_quant_offload_device_map(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "llama":
model = load_quant_offload_device_map(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "opt":
model = load_quant_offload_device_map(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "mpt":
model = load_quant_offload_device_map(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
elif model_type == "gpt_bigcode":
model = load_quant_offload_device_map(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias).half()
else:
raise RuntimeError("Model not supported by Occam's GPTQ")
except:
self.implementation = "AutoGPTQ"
if self.implementation == "AutoGPTQ":
try:
import auto_gptq
from auto_gptq import AutoGPTQForCausalLM
except ImportError:
raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")
# Patch in args support..
def generate(self, *args, **kwargs):
"""shortcut for model.generate"""
with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
return self.model.generate(*args, **kwargs)
# Monkey patch in hf_bleeding_edge to avoid having to trust remote code
auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM
type(model).generate = generate
try:
model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"), device_map=device_map)
except:
model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"), device_map=device_map, disable_exllama=True)
# Patch in embeddings function
def get_input_embeddings(self):
return self.model.get_input_embeddings()
type(model).get_input_embeddings = get_input_embeddings
# Patch in args support..
def generate(self, *args, **kwargs):
"""shortcut for model.generate"""
with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
return self.model.generate(*args, **kwargs)
type(model).generate = generate
return model

View File

@@ -19,8 +19,12 @@ class HFInferenceModel(InferenceModel):
def __init__(self) -> None:
super().__init__()
self.model_config = None
#self.model_name = model_name
# TODO: model_name should probably be an instantiation parameter all the
# way down the inheritance chain.
self.model_name = None
self.path = None
self.hf_torch = False
self.model = None
self.tokenizer = None
@@ -217,6 +221,11 @@ class HFInferenceModel(InferenceModel):
torch.cuda.empty_cache()
except:
pass
def _pre_load(self) -> None:
# HACK: Make model instantiation work without UI parameters
self.model_name = self.model_name or utils.koboldai_vars.model
return super()._pre_load()
def _post_load(self) -> None:
self.badwordsids = koboldai_settings.badwordsids_default

View File

@@ -133,7 +133,8 @@ class model_backend(HFInferenceModel):
utils.koboldai_vars.compiling = True
def mtj_stopped_compiling_callback() -> None:
print(Colors.GREEN + "TPU backend compilation stopped" + Colors.END)
if utils.koboldai_vars.compiling:
print(Colors.GREEN + "TPU backend compilation stopped" + Colors.END)
utils.koboldai_vars.compiling = False
def mtj_settings_callback() -> dict:

View File

@@ -34,6 +34,7 @@ from modeling.stoppers import Stoppers
from modeling.post_token_hooks import PostTokenHooks
from modeling.inference_models.hf import HFInferenceModel
from modeling.inference_model import (
GenerationMode,
GenerationResult,
GenerationSettings,
ModelCapabilities,
@@ -92,7 +93,11 @@ class HFTorchInferenceModel(HFInferenceModel):
self.hf_torch = True
self.lazy_load = True
self.low_mem = False
# `nobreakmodel` indicates that breakmodel cannot be used, while `breakmodel`
# indicates whether breakmodel is currently being used
self.nobreakmodel = False
self.breakmodel = False
self.post_token_hooks = [
PostTokenHooks.stream_tokens,
@@ -126,8 +131,13 @@ class HFTorchInferenceModel(HFInferenceModel):
return ret
def get_auxilary_device(self) -> Union[str, int, torch.device]:
return self.breakmodel_config.primary_device
if self.breakmodel:
return self.breakmodel_config.primary_device
if self.usegpu:
return "cuda:0"
else:
return "cpu"
def _get_target_dtype(self) -> Union[torch.float16, torch.float32]:
if self.breakmodel_config.primary_device == "cpu":
return torch.float32
@@ -228,9 +238,6 @@ class HFTorchInferenceModel(HFInferenceModel):
)
class KoboldLogitsWarperList(LogitsProcessorList):
def __init__(self):
pass
def __call__(
lw_self,
input_ids: torch.LongTensor,
@@ -247,17 +254,14 @@ class HFTorchInferenceModel(HFInferenceModel):
), f"Scores are None; processor '{processor}' is to blame"
return scores
def new_get_logits_warper(
beams: int = 1,
) -> LogitsProcessorList:
return KoboldLogitsWarperList()
def new_sample(self, *args, **kwargs):
assert kwargs.pop("logits_warper", None) is not None
kwargs["logits_warper"] = new_get_logits_warper(
beams=1,
)
if utils.koboldai_vars.newlinemode in ["s", "ns"]:
kwargs["logits_warper"] = KoboldLogitsWarperList()
if (
utils.koboldai_vars.newlinemode in ["s", "ns"]
and not m_self.gen_state["allow_eos"]
):
kwargs["eos_token_id"] = -1
kwargs.setdefault("pad_token_id", 2)
return new_sample.old_sample(self, *args, **kwargs)
@@ -329,7 +333,7 @@ class HFTorchInferenceModel(HFInferenceModel):
with torch.no_grad():
start_time = time.time()
genout = self.model.generate(
gen_in,
input_ids=gen_in,
do_sample=True,
max_length=min(
len(prompt_tokens) + max_new, utils.koboldai_vars.max_length
@@ -608,3 +612,9 @@ class HFTorchInferenceModel(HFInferenceModel):
self.breakmodel = False
self.usegpu = False
return
def get_supported_gen_modes(self) -> List[GenerationMode]:
# This gen mode changes a torch patch so that EOS is no longer disallowed as a bad word.
return super().get_supported_gen_modes() + [
GenerationMode.UNTIL_EOS
]

View File

@@ -1,12 +1,10 @@
from __future__ import annotations
import torch
import requests
import numpy as np
from typing import List, Optional, Union
import utils
from logger import logger
from modeling.inference_model import (
GenerationResult,
GenerationSettings,
@@ -15,29 +13,46 @@ from modeling.inference_model import (
)
model_backend_name = "Read Only"
model_backend_type = "Read Only" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)
model_backend_type = "Read Only" # This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)
class BasicAPIException(Exception):
"""To be used for errors when using the Basic API as an interface."""
class DummyHFTokenizerOut:
input_ids = np.array([[]])
class FacadeTokenizer:
def __init__(self):
self._koboldai_header = []
def decode(self, _input):
return ""
def encode(self, input_text):
return []
def __call__(self, *args, **kwargs) -> DummyHFTokenizerOut:
return DummyHFTokenizerOut()
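
Since `FacadeTokenizer` and `DummyHFTokenizerOut` above are plain stubs, their behaviour can be checked in isolation. A small usage sketch (assuming the two classes are in scope):

```
# Usage sketch of the read-only facade: every call returns an empty
# result, so downstream code keeps working without a real model.
tok = FacadeTokenizer()

print(tok.encode("any text"))            # -> []
print(repr(tok.decode([1, 2, 3])))       # -> ''
print(tok("any text").input_ids.shape)   # -> (1, 0)
```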
class model_backend(InferenceModel):
def __init__(self) -> None:
super().__init__()
# Do not allow API to be served over the API
# Do not allow ReadOnly to be served over the API
self.capabilties = ModelCapabilities(api_host=False)
self.tokenizer = self._tokenizer()
self.tokenizer: FacadeTokenizer = None
self.model = None
self.model_name = "Read Only"
def is_valid(self, model_name, model_path, menu_path):
return model_name == "ReadOnly"
def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
def get_requested_parameters(
self, model_name, model_path, menu_path, parameters={}
):
requested_parameters = []
return requested_parameters
def set_input_parameters(self, parameters):
return
@@ -46,17 +61,9 @@ class model_backend(InferenceModel):
def _initialize_model(self):
return
class _tokenizer():
def __init__(self):
self._koboldai_header = []
def decode(self, _input):
return ""
def encode(self, input_text):
return []
def _load(self, save_model: bool = False, initial_load: bool = False) -> None:
self.tokenizer = self.tokenizer
self.tokenizer = FacadeTokenizer()
self.model = None
utils.koboldai_vars.noai = True
@@ -72,7 +79,7 @@ class model_backend(InferenceModel):
):
return GenerationResult(
model=self,
out_batches=np.array([]),
out_batches=np.array([[]]),
prompt=prompt_tokens,
is_whole_generation=True,
single_line=single_line,

View File

@@ -51,15 +51,12 @@ import time
import zipfile
import pickle
import torch
import numpy as np
import collections
import _codecs
import os
from typing import Any, Callable, Dict, Optional, Tuple, Type
from torch import Tensor
from torch.nn import Module
from torch.storage import UntypedStorage
from modeling.pickling import RestrictedUnpickler, use_custom_unpickler
from modeling.patches import LazyloadPatches
# Safetensors is a dependency for the local version, TPU/Colab doesn't
@@ -176,9 +173,6 @@ class TorchLazyTensor(LazyTensor):
CheckpointChunkCache.key = self.key
ziproot = checkpoint.namelist()[0].split("/")[0]
CheckpointChunkCache.handle = checkpoint.open(f"{ziproot}/data/{self.key}", "r")
else:
# Cache hit. Hip hip hooray! :^)
# print(".", end="", flush=True)
@@ -239,86 +233,11 @@ class SafetensorsLazyTensor(LazyTensor):
self.checkpoint_file, tensor_key=self.key, device=self.location
)
def _patched_rebuild_from_type_v2(func, new_type, args, state):
"""A patched version of torch._tensor._rebuild_from_type_v2 that
does not attempt to convert `LazyTensor`s to `torch.Tensor`s."""
ret = func(*args)
# BEGIN PATCH
transformation_ok = isinstance(ret, LazyTensor) and new_type == Tensor
if type(ret) is not new_type and not transformation_ok:
# END PATCH
ret = ret.as_subclass(new_type)
# Tensor does define __setstate__ even though it doesn't define
# __getstate__. So only use __setstate__ if it is NOT the one defined
# on Tensor
if (
getattr(ret.__class__, "__setstate__", Tensor.__setstate__)
is not Tensor.__setstate__
):
ret.__setstate__(state)
else:
ret = torch._utils._set_obj_state(ret, state)
return ret
class RestrictedUnpickler(pickle.Unpickler):
def original_persistent_load(self, saved_id):
return super().persistent_load(saved_id)
def forced_persistent_load(self, saved_id):
if saved_id[0] != "storage":
raise pickle.UnpicklingError("`saved_id[0]` must be 'storage'")
return self.original_persistent_load(saved_id)
def find_class(self, module, name):
if module == "collections" and name == "OrderedDict":
return collections.OrderedDict
elif module == "torch._utils" and name == "_rebuild_tensor_v2":
return torch._utils._rebuild_tensor_v2
elif module == "torch._tensor" and name == "_rebuild_from_type_v2":
return _patched_rebuild_from_type_v2
elif module == "torch" and name in (
"DoubleStorage",
"FloatStorage",
"HalfStorage",
"LongStorage",
"IntStorage",
"ShortStorage",
"CharStorage",
"ByteStorage",
"BoolStorage",
"BFloat16Storage",
"Tensor",
):
return getattr(torch, name)
elif module == "numpy.core.multiarray" and name == "scalar":
return np.core.multiarray.scalar
elif module == "numpy" and name == "dtype":
return np.dtype
elif module == "_codecs" and name == "encode":
return _codecs.encode
else:
# Forbid everything else.
qualified_name = name if module == "__builtin__" else f"{module}.{name}"
raise pickle.UnpicklingError(
f"`{qualified_name}` is forbidden; the model you are loading probably contains malicious code. If you think this is incorrect ask the developer to unban the ability for {module} to execute {name}"
)
def load(self, *args, **kwargs):
self.original_persistent_load = getattr(
self, "persistent_load", pickle.Unpickler.persistent_load
)
self.persistent_load = self.forced_persistent_load
return super().load(*args, **kwargs)
class _LazyUnpickler(RestrictedUnpickler):
lazy_loaded_storages: Dict[str, LazyTensor]
def __init__(self, *args, **kwargs):
# print(args, kwargs)
self.lazy_loaded_storages = {}
return super().__init__(*args, **kwargs)
@@ -376,7 +295,7 @@ def patch_safetensors(callback):
# (70 tensors/s -> 65 tensors/s). The memory savings probably
# shouldn't be happening; maybe there's a memory leak
# somewhere in our pipeline with CPU tensors.
intermediary_device = "cuda"
intermediary_device = "cuda:0"
else:
intermediary_device = "cpu"
@@ -409,27 +328,9 @@ def patch_safetensors(callback):
return tensors
transformers.modeling_utils.safe_load_file = safetensors_load
safetensors.torch.load_file = safetensors_load
@contextlib.contextmanager
def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler):
try:
old_unpickler = pickle.Unpickler
pickle.Unpickler = unpickler
old_pickle_load = pickle.load
def new_pickle_load(*args, **kwargs):
return pickle.Unpickler(*args, **kwargs).load()
pickle.load = new_pickle_load
yield
finally:
pickle.Unpickler = old_unpickler
pickle.load = old_pickle_load
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
for hook in self._load_state_dict_pre_hooks.values():
hook(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)

View File

@@ -233,6 +233,8 @@ class PhraseBiasLogitsProcessor:
token_seqs = self._get_token_sequence(phrase)
variant_deltas = {}
for token_seq in token_seqs:
if not token_seq:
continue
bias_index = self._find_intersection(input_ids, token_seq)
# Ensure completion after completion_threshold tokens
@@ -267,6 +269,14 @@ class PhraseBiasLogitsProcessor:
for batch in range(scores_shape[0]):
for token, bias in self._get_biased_tokens(input_ids[batch]).items():
scores[batch][token] += bias
if bias > 0 and bool(scores[batch][token].isneginf()):
# Adding a bias to -inf does nothing, so just set the score to the
# bias directly for now. There may be a more mathematically correct
# way to do this, but it works. Also, make sure the bias is actually
# positive: don't give a -inf token more of a chance by setting it
# to -0.5!
scores[batch][token] = bias
else:
scores[batch][token] += bias
return scores
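
The `-inf` special case above exists because negative infinity absorbs any finite addition in IEEE floating point (and therefore in torch). A short demonstration, as a standalone sketch:

```
import torch

# Adding a finite bias to -inf leaves the score at -inf, so a banned
# token would stay banned no matter how large the positive bias is.
scores = torch.tensor([0.0, float("-inf")])
print(scores[1] + 5.0)    # tensor(-inf)

# The processor therefore overwrites the score with the bias directly.
scores[1] = 5.0
print(scores[1])          # tensor(5.)
```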

View File

@@ -129,15 +129,33 @@ def patch_transformers_generation() -> None:
class LazyloadPatches:
class StateDictFacade(dict):
def __init__(self, state_dict):
self.update(state_dict)
def __getitem__(self, name):
return super().__getitem__(name).materialize(map_location="cuda:0")
old_load_state_dict = transformers.modeling_utils._load_state_dict_into_meta_model
torch_old_load_from_state_dict = torch.nn.Module._load_from_state_dict
def __enter__() -> None:
transformers.modeling_utils._load_state_dict_into_meta_model = (
LazyloadPatches._load_state_dict_into_meta_model
)
torch.nn.Module._load_from_state_dict = LazyloadPatches._torch_load_from_state_dict
def __exit__(exc_type, exc_value, exc_traceback) -> None:
transformers.modeling_utils._load_state_dict_into_meta_model = LazyloadPatches.old_load_state_dict
torch.nn.Module._load_from_state_dict = LazyloadPatches.torch_old_load_from_state_dict
def _torch_load_from_state_dict(self, state_dict, *args, **kwargs):
return LazyloadPatches.torch_old_load_from_state_dict(
self,
LazyloadPatches.StateDictFacade(state_dict),
*args,
**kwargs
)
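
The enter/exit hooks above follow a swap-and-restore pattern: stash the original callables as class attributes, replace them on entry, and restore them on exit. The hooks take no `self`, which suggests they are invoked explicitly on the class; the toy analogue below mirrors that, patching `json.loads` instead of the transformers/torch loaders (names and target are illustrative only):

```
import json

class JsonLoadsPatch:
    """Toy analogue of the swap-and-restore patching above, applied to
    json.loads instead of the transformers/torch loaders."""

    old_loads = json.loads

    def __enter__() -> None:
        # No-self hook, mirroring the signatures above.
        json.loads = lambda s, *a, **kw: JsonLoadsPatch.old_loads(s, *a, **kw)

    def __exit__(exc_type, exc_value, exc_traceback) -> None:
        json.loads = JsonLoadsPatch.old_loads

# Entered and exited explicitly on the class, no instance needed.
JsonLoadsPatch.__enter__()
try:
    print(json.loads("{}"))  # routed through the patched function -> {}
finally:
    JsonLoadsPatch.__exit__(None, None, None)
```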
def _load_state_dict_into_meta_model(
model,

111
modeling/pickling.py Normal file
View File

@@ -0,0 +1,111 @@
from __future__ import annotations
import collections
import contextlib
import pickle
import _codecs
from typing import Type
import numpy as np
import torch
from torch import Tensor
import modeling
def _patched_rebuild_from_type_v2(func, new_type, args, state):
"""A patched version of torch._tensor._rebuild_from_type_v2 that
does not attempt to convert `LazyTensor`s to `torch.Tensor`s."""
ret = func(*args)
# BEGIN PATCH
transformation_ok = isinstance(ret, modeling.lazy_loader.LazyTensor) and new_type == Tensor
if type(ret) is not new_type and not transformation_ok:
# END PATCH
ret = ret.as_subclass(new_type)
# Tensor does define __setstate__ even though it doesn't define
# __getstate__. So only use __setstate__ if it is NOT the one defined
# on Tensor
if (
getattr(ret.__class__, "__setstate__", Tensor.__setstate__)
is not Tensor.__setstate__
):
ret.__setstate__(state)
else:
ret = torch._utils._set_obj_state(ret, state)
return ret
class RestrictedUnpickler(pickle.Unpickler):
def original_persistent_load(self, saved_id):
return super().persistent_load(saved_id)
def forced_persistent_load(self, saved_id):
if saved_id[0] != "storage":
raise pickle.UnpicklingError("`saved_id[0]` must be 'storage'")
return self.original_persistent_load(saved_id)
def find_class(self, module, name):
if module == "collections" and name == "OrderedDict":
return collections.OrderedDict
elif module == "torch._utils" and name in (
"_rebuild_tensor_v2",
"_rebuild_meta_tensor_no_storage",
):
return getattr(torch._utils, name)
elif module == "torch._tensor" and name == "_rebuild_from_type_v2":
return _patched_rebuild_from_type_v2
elif module == "torch" and name in (
"DoubleStorage",
"FloatStorage",
"HalfStorage",
"LongStorage",
"IntStorage",
"ShortStorage",
"CharStorage",
"ByteStorage",
"BoolStorage",
"BFloat16Storage",
"Tensor",
"float16",
):
return getattr(torch, name)
elif module == "numpy.core.multiarray" and name == "scalar":
return np.core.multiarray.scalar
elif module == "numpy" and name == "dtype":
return np.dtype
elif module == "_codecs" and name == "encode":
return _codecs.encode
else:
# Forbid everything else.
qualified_name = name if module == "__builtin__" else f"{module}.{name}"
raise pickle.UnpicklingError(
f"`{qualified_name}` is forbidden; the model you are loading probably contains malicious code. If you think this is incorrect ask the developer to unban the ability for {module} to execute {name}"
)
def load(self, *args, **kwargs):
self.original_persistent_load = getattr(
self, "persistent_load", pickle.Unpickler.persistent_load
)
self.persistent_load = self.forced_persistent_load
return super().load(*args, **kwargs)
@contextlib.contextmanager
def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler):
try:
old_unpickler = pickle.Unpickler
pickle.Unpickler = unpickler
old_pickle_load = pickle.load
def new_pickle_load(*args, **kwargs):
return pickle.Unpickler(*args, **kwargs).load()
pickle.load = new_pickle_load
yield
finally:
pickle.Unpickler = old_unpickler
pickle.load = old_pickle_load
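
A short usage sketch of `use_custom_unpickler`: inside the context, `pickle.load` is rerouted through `RestrictedUnpickler`, so only allow-listed globals (e.g. `collections.OrderedDict`) deserialize and anything else raises `pickle.UnpicklingError`. The payload below is purely illustrative:

```
import collections
import io
import pickle

from modeling.pickling import RestrictedUnpickler, use_custom_unpickler

# A harmless payload whose only global, collections.OrderedDict, is on
# the allow-list enforced by RestrictedUnpickler.find_class.
payload = io.BytesIO(pickle.dumps(collections.OrderedDict(a=1)))

with use_custom_unpickler(RestrictedUnpickler):
    # pickle.load is temporarily replaced, so this goes through the
    # restricted unpickler; a payload referencing os.system (for example)
    # would raise pickle.UnpicklingError instead of executing anything.
    restored = pickle.load(payload)

print(restored)  # an OrderedDict with a single key "a"
```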

View File

@@ -3,15 +3,12 @@ from __future__ import annotations
import torch
import utils
from modeling.inference_model import (
InferenceModel,
)
from modeling import inference_model
class Stoppers:
@staticmethod
def core_stopper(
model: InferenceModel,
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
if not utils.koboldai_vars.inference_config.do_core:
@@ -62,7 +59,7 @@ class Stoppers:
@staticmethod
def dynamic_wi_scanner(
model: InferenceModel,
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
if not utils.koboldai_vars.inference_config.do_dynamic_wi:
@@ -93,7 +90,7 @@ class Stoppers:
@staticmethod
def chat_mode_stopper(
model: InferenceModel,
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
if not utils.koboldai_vars.chatmode:
@@ -118,7 +115,7 @@ class Stoppers:
@staticmethod
def stop_sequence_stopper(
model: InferenceModel,
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
@@ -126,7 +123,12 @@ class Stoppers:
# null_character = model.tokenizer.encode(chr(0))[0]
if "completed" not in model.gen_state:
model.gen_state["completed"] = [False] * len(input_ids)
if utils.koboldai_vars.adventure:
extra_options = [">", "\n>"]
for option in extra_options:
if option not in utils.koboldai_vars.stop_sequence:
utils.koboldai_vars.stop_sequence.append(option)
# One issue is that the stop sequence may not actually align with the end of a
# token if it is a subsection of a longer token.
for stopper in utils.koboldai_vars.stop_sequence:
@@ -140,19 +142,31 @@ class Stoppers:
if all(model.gen_state["completed"]):
utils.koboldai_vars.generated_tkns = utils.koboldai_vars.genamt
del model.gen_state["completed"]
if utils.koboldai_vars.adventure: # Remove added adventure mode stop sequences
for option in extra_options:
if option in utils.koboldai_vars.stop_sequence:
utils.koboldai_vars.stop_sequence.remove(option)
return True
return False
@staticmethod
def singleline_stopper(
model: InferenceModel,
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
"""If singleline mode is enabled, it's pointless to generate output beyond the first newline."""
"""Stop on occurances of newlines **if singleline is enabled**."""
# It might be better just to do this further up the line
if not utils.koboldai_vars.singleline:
return False
return Stoppers.newline_stopper(model, input_ids)
@staticmethod
def newline_stopper(
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
"""Stop on occurances of newlines."""
# Keep track of presence of newlines in each sequence; we cannot stop a
# batch member individually, so we must wait for all of them to contain
# a newline.
@@ -167,3 +181,30 @@ class Stoppers:
del model.gen_state["newline_in_sequence"]
return True
return False
@staticmethod
def sentence_end_stopper(
model: inference_model.InferenceModel,
input_ids: torch.LongTensor,
) -> bool:
"""Stops at the end of sentences."""
# TODO: Make this more robust
SENTENCE_ENDS = [".", "?", "!"]
# We need to keep track of stopping for each batch, since we can't stop
# one individually.
if "sentence_end_in_sequence" not in model.gen_state:
model.gen_state["sentence_end_sequence"] = [False] * len(input_ids)
for sequence_idx, batch_sequence in enumerate(input_ids):
decoded = model.tokenizer.decode(batch_sequence[-1])
for end in SENTENCE_ENDS:
if end in decoded:
model.gen_state["sentence_end_sequence"][sequence_idx] = True
break
if all(model.gen_state["sentence_end_sequence"]):
del model.gen_state["sentence_end_sequence"]
return True
return False
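
`newline_stopper` and `sentence_end_stopper` share the same batch-wide pattern: flag each sequence as it meets the condition and only stop once every flag is set, because a batch cannot be stopped member by member. A standalone sketch of that pattern (plain lists instead of `gen_state`, illustrative names):

```
# Standalone sketch of the batch-wide stopping pattern used above:
# each sequence is flagged once it satisfies the condition, and the
# batch only stops when every sequence has been flagged.
def should_stop(last_token_texts, flags, triggers=(".", "?", "!")):
    for idx, text in enumerate(last_token_texts):
        if any(trigger in text for trigger in triggers):
            flags[idx] = True
    return all(flags)

flags = [False, False]
print(should_stop(["Hello", " world."], flags))   # False: sequence 0 not done yet
print(should_stop(["there.", " again"], flags))   # True: both sequences have ended
```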

View File

@@ -39,7 +39,6 @@ pytest-metadata==2.0.4
requests-mock==1.10.0
safetensors==0.3.1
git+https://github.com/0cc4m/hf_bleeding_edge/
--find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html gptq_koboldai==0.0.4
einops
peft==0.3.0
scipy
scipy

File diff suppressed because one or more lines are too long

View File

@@ -1,10 +1,10 @@
/*----------------Global Colors------------------*/
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
:root {
--flyout_menu_width: 100%;
}
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
:root {
--flyout_menu_width: 402px;
}
@@ -448,19 +448,19 @@ border-top-right-radius: var(--tabs_rounding);
cursor: pointer;
}
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
/* mobile */
.menu_icon.hidden {
display: inline-block !important;
}
}
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
.menu_pin {
display: none;
}
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
.menu_pin {
position: absolute;
top:10px;
@@ -516,7 +516,7 @@ border-top-right-radius: var(--tabs_rounding);
will-change: transform;
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
.SideMenu.pinned {
right: calc(100% - var(--flyout_menu_width));
background-color: var(--flyout_background_pinned);
@@ -906,7 +906,7 @@ border-top-right-radius: var(--tabs_rounding);
grid-area: lefticon;
}
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
/* mobile */
.right_menu_icon.hidden {
display: inline-block !important;
@@ -937,7 +937,7 @@ border-top-right-radius: var(--tabs_rounding);
left: calc(100% - var(--flyout_menu_width));
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
/* Desktop Mode */
.rightSideMenu.pinned {
left: calc(100% - var(--flyout_menu_width));
@@ -959,14 +959,14 @@ border-top-right-radius: var(--tabs_rounding);
filter: brightness(40%);
}
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
/* mobile */
.story_menu_pin {
display: none;
}
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
.story_menu_pin {
position: absolute;
top:10px;
@@ -1259,17 +1259,46 @@ td.server_vars {
.world_info_label_container > .generate-button:hover { opacity: 1.0; }
.tag {
display: inline-block;
background-color: var(--wi_tag_color);
color: var(--wi_tag_text_color);
margin-right: 3px;
margin-top: 3px;
padding: 2px;
margin-right: 2px;
padding-left: 3px;
padding-right: 3px;
border-radius: var(--radius_wi_card);
border: solid;
border-color: var(--wi_tag_color);
}
.tag .tag_button {
cursor: pointer;
opacity: 0.4;
font-size: 16px;
position: relative;
}
.tag .delete_icon {
cursor: pointer;
top: 3px;
right: 3px;
}
.tag .add_icon {
top: 3px;
right: 3px;
}
.tag .tag_text {
display: inline-block;
outline: none;
position: relative;
right: 3px;
}
.placeholder_tag .tag_text:empty {
opacity: 0.4;
}
.oi[folder] {
@@ -1457,6 +1486,30 @@ td.server_vars {
line-height: 2;
}
/* Privacy Mode (Lock Screen) */
#privacy_mode {
height: unset;
width: unset;
position: relative;
top: unset;
left: unset;
}
#privacy_mode .popup_list_area {
display: flex;
align-items: center;
flex-direction: column;
padding-top: 10px;
padding-bottom: 10px;
padding-left: 15px;
padding-right: 15px;
}
#privacy_mode input {
margin-top: 15px;
width: 85%;
}
/* ---------------------------- OVERALL PAGE CONFIG ------------------------------*/
body {
background-color: var(--background);
@@ -1489,7 +1542,7 @@ body {
background-color: #cacaca80;
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
/* ------------------- Desktop Mode --------------------------- */
.main-grid {
transition: margin-left .5s, margin-right .5s;
@@ -1504,7 +1557,7 @@ body {
grid-template-columns: 30px auto 30% 30px;
grid-template-rows: auto min-content min-content 100px;
}
.main-grid[option_length="0"][model_numseqs="1"] {
.main-grid[hide-options="true"] {
grid-template-columns: 30px auto 0px 30px;
}
@@ -1523,7 +1576,7 @@ body {
}
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
/* mobile */
.main-grid {
transition: margin-left .5s;
@@ -1589,39 +1642,43 @@ body {
font-style: italic;
}
.sequence_area {
#token-stream-buffer {
white-space: pre-wrap;
}
#option-container {
margin-top: 10px;
grid-area: options;
background-color: var(--sequence_area_background);
overflow-y: scroll;
}
.sequence_area::-webkit-scrollbar {
#option-container::-webkit-scrollbar {
display: none;
}
@media only screen and (max-aspect-ratio: 7/5) {
.sequences {
margin-top: 5px;
width: 100%;
border: 0px;
border-spacing: 0;
display: flex;
flex-direction: row;
overflow-x: scroll;
scroll-snap-type: x mandatory;
}
@media only screen and (max-aspect-ratio: 5/6) {
#option-container {
margin-top: 5px;
width: 100%;
border: 0px;
border-spacing: 0;
display: flex;
flex-direction: row;
overflow-x: scroll;
scroll-snap-type: x mandatory;
}
}
@media only screen and (min-aspect-ratio: 7/5) {
.sequences {
margin-top: 5px;
width: 100%;
border: 0px;
border-spacing: 0;
display: flex;
flex-direction: column;
}
@media only screen and (min-aspect-ratio: 5/6) {
#option-container {
margin-top: 5px;
width: 100%;
border: 0px;
border-spacing: 0;
display: flex;
flex-direction: column;
}
}
.sequence_row {
@@ -1907,7 +1964,7 @@ body {
overflow: hidden;
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
.paddingimage {
grid-area: paddingimage;
margin: auto auto auto auto;
@@ -1915,14 +1972,14 @@ body {
}
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
.paddingimage {
visibility: hidden;
}
}
/*---------------------------------- Popups -------------------------------------------------*/
@media only screen and (max-aspect-ratio: 7/5) {
@media only screen and (max-aspect-ratio: 5/6) {
.popup {
position: absolute;
top: 10vh;
@@ -1939,7 +1996,7 @@ body {
}
}
@media only screen and (min-aspect-ratio: 7/5) {
@media only screen and (min-aspect-ratio: 5/6) {
.popup {
position: absolute;
top: 10vh;
@@ -1962,6 +2019,7 @@ body {
color: var(--popup_title_bar_color_text);
text-align: center;
font-size: calc(1.3em + var(--font_size_adjustment));
user-select: none;
}
.popup .action_button {
@@ -2705,13 +2763,14 @@ body {
#context-menu > hr {
/* Division Color*/
border-top: 2px solid var(--context_menu_division);
margin: 5px 5px;
margin: 3px 5px;
}
.context-menu-item {
padding: 5px;
padding: 4px;
padding-right: 25px;
min-width: 100px;
white-space: nowrap;
}
.context-menu-item:hover {
@@ -2722,11 +2781,16 @@ body {
.context-menu-item > .material-icons-outlined {
position: relative;
top: 2px;
top: 3px;
font-size: 15px;
margin-right: 5px;
}
.context-menu-item > .context-menu-label {
position: relative;
top: 1px;
}
/* Substitutions */
#Substitutions {
margin-left: 10px;
@@ -2820,6 +2884,10 @@ body {
height: 100%;
}
#welcome_text a {
text-decoration: underline;
}
.welcome_text {
display: flex;
height: 100%;
@@ -2848,6 +2916,7 @@ body {
display: flex;
justify-content: center;
align-items: center;
pointer-events: none;
}
#welcome-text-content {
@@ -3537,10 +3606,15 @@ h2 .material-icons-outlined {
}
.section_header {
font-weight: bold;
margin-left: 2px;
margin-bottom: 2px;
}
.story_category_area > * > label {
user-select: none
}
.help_text {
margin-left: 6px;
margin-bottom: 0.7em;

File diff suppressed because it is too large Load Diff

View File

@@ -44,7 +44,7 @@
</div>
<!------------ Main Screen--------------------->
<div id="main-grid" class="main-grid settings_pinned var_sync_alt_model_numseqs" onclick="close_menus();" option_length="0">
<div id="main-grid" class="main-grid settings_pinned var_sync_alt_model_numseqs" onclick="close_menus();" hide-options="true">
<!------------ Game Text Screen--------------------->
<div class="gamescreen" id="gamescreen" context-menu="gamescreen">
<div id="disconnect_message"><center><h1>Disconnected</h1></center></div>
@@ -53,13 +53,13 @@
<div id="welcome_text" class="var_sync_model_welcome" draggable="False"></div>
</div>
<div class="gametext" id="Selected Text" contenteditable=false tabindex=0 onpaste="check_game_after_paste()" onfocusout="savegametextchanges();" onclick="return set_edit(event)" onkeyup="return set_edit(event);">
<div class="gametext" id="Selected Text" contenteditable="false" tabindex="0" onkeyup="return set_edit(event);">
<span id="story_prompt" class="var_sync_story_prompt var_sync_alt_story_prompt_in_ai rawtext hidden" chunk="-1"></span></div><!--don't move the /div down or it'll cause odd spacing issues in the UI--->
</div>
<!------------ Sequences --------------------->
<div id="action_count" class="var_sync_actions_Action_Count hidden"></div>
<div id="Select Options" class="sequence_area"></div>
<div id="option-container" class="hidden"></div>
<!-- Story Review -->
<div id="story-review" class="hidden">
@@ -110,9 +110,9 @@
<button type="button" class="btn action_button" style="width: 30px; padding: 0px;" onclick='play_pause_tts()' aria-label="play"><span id="play_tts" class="material-icons-outlined" style="font-size: 1.4em;">play_arrow</span></button>
<button type="button" class="btn action_button" style="width: 30px; padding: 0px;" onclick='stop_tts()' aria-label="play"><span id="stop_tts" class="material-icons-outlined" style="font-size: 1.4em;">stop</span></button>
</span>
<button type="button" class="btn action_button submit var_sync_alt_system_aibusy" system_aibusy=False id="btnsubmit" onclick="storySubmit();">Submit</button>
<button type="button" class="btn action_button submit var_sync_alt_system_aibusy" system_aibusy=False id="btnsubmit" onclick="storySubmit();" context-menu="submit-button">Submit</button>
<button type="button" class="btn action_button submited var_sync_alt_system_aibusy" system_aibusy=False id="btnsent"><img id="thinking" src="static/thinking.gif" class="force_center" onclick="socket.emit('abort','');"></button>
<button type="button" class="btn action_button back var_sync_alt_system_aibusy" system_aibusy=False onclick="storyBack();" aria-label="undo"><span class="material-icons-outlined" style="font-size: 1.4em;">replay</span></button>
<button type="button" class="btn action_button back var_sync_alt_system_aibusy" system_aibusy=False onclick="storyBack();" aria-label="undo" context-menu="undo-button"><span class="material-icons-outlined" style="font-size: 1.4em;">replay</span></button>
<button type="button" class="btn action_button redo var_sync_alt_system_aibusy" system_aibusy=False onclick="storyRedo();" aria-label="redo"><span class="material-icons-outlined" style="font-size: 1.4em;">arrow_forward</span></button>
<button type="button" class="btn action_button retry var_sync_alt_system_aibusy" system_aibusy=False onclick="storyRetry();" aria-label="retry"><span class="material-icons-outlined" style="font-size: 1.4em;">autorenew</span></button>
</div>

View File

@@ -70,12 +70,12 @@
</div>
</div>
<!---------------- Private Mode Unlock screen ---------------------->
<div id="privacy_mode" class="popup-window popup">
<div id="privacy_mode" class="popup-window popup" allow-close="false">
<div class="title">
<div class="popuptitletext">Locked</div>
</div>
<div id="popup_list_area" class="popup_list_area">
This story is in private mode. Please enter password to unlock<br/>
This story is in private mode. Please enter the password to unlock it.<br/>
<input type="password" id="privacy_password"/>
</div>
<div class="popup_load_cancel">

View File

@@ -50,6 +50,14 @@
<label for="authors_notes">Author's Notes:</label><br/>
<textarea autocomplete="off" rows=16 id="authors_notes" class="var_sync_story_authornote var_sync_alt_story_authornote_length fullwidth" oninput="autoResize(this)" onchange='sync_to_server(this);'></textarea><br/>
<div class="setting_tile_area">
{% with menu='author_notes' %}
{% with sub_path='' %}
{% include 'settings item.html' %}
{% endwith %}
{% endwith %}
</div>
<h4 class="section_header">Genre</h4>
<div class="help_text">Styles the AI will attempt to imitate. Effectiveness depends on model.</div>
<input id="genre-input" class="fullwidth" placeholder="Fantasy" autocomplete="off" spellcheck="false">
@@ -75,14 +83,6 @@
}
</script>
</div>
<div class="setting_tile_area">
{% with menu='author_notes' %}
{% with sub_path='' %}
{% include 'settings item.html' %}
{% endwith %}
{% endwith %}
</div>
</div>
</div>
<div id="story_menu_notes" class="story_category_area tab-target tab-target-story hidden">
@@ -97,7 +97,7 @@
<div id="story_menu_wi" class="story_category_area tab-target tab-target-story hidden">
<h4 class="section_header" style="margin-left: 12px;">World Info</h4>
<div class="help_text" style="margin-left: 20px;">
Lore information, which the AI recalls by certain words.
Lore information, which the AI recalls with the mention of certain words.
<span class="helpicon material-icons-outlined" tooltip="Use this instead of Memory for information on things like characters, objects, events, places, and anything else with detail.">help_icon</span>
</div>
<div class="setting_tile_area wi_settings">

View File

@@ -22,12 +22,16 @@
<span
class="world_info_item_type"
contenteditable="true"
data-placeholder="Person"
data-placeholder="..."
spellcheck="false"
></span> <span class="helpicon material-icons-outlined" tooltip="Please enter a noun that describes a person, place or thing." "]">help_icon</span>
></span>
<span
class="helpicon material-icons-outlined"
tooltip='Please enter a noun that describes this entry. For example, "person", "weapon", or "building". This will be used with the Generate Content button below.'
>help_icon</span>
</div>
</div>
<span id="world_info_delete_" class="world_info_delete">X</span>
<span id="world_info_delete_" class="world_info_delete material-icons-outlined">close</span>
</div>
<div class="world_info_upper_container world_info_tag_area">

View File

@@ -1 +1 @@
#welcome_text { display:none; pointer-events: none }
#welcome-logo { display:none; pointer-events: none }

View File

@@ -1116,10 +1116,11 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword
thread_resources_env = maps.ResourceEnv(maps.Mesh(devices, ('dp', 'mp')), ())
maps.thread_resources.env = thread_resources_env
if initial_load:
logger.message(f"KoboldAI has finished loading and is available at the following link for UI 1: {koboldai_vars.cloudflare_link}")
logger.message(f"KoboldAI has finished loading and is available at the following link for UI 2: {koboldai_vars.cloudflare_link}/new_ui")
logger.message(f"KoboldAI has finished loading and is available at the following link for KoboldAI Lite: {koboldai_vars.cloudflare_link}/lite")
logger.message(f"KoboldAI has finished loading and is available at the following link for the API: {koboldai_vars.cloudflare_link}/api")
logger.message(f"KoboldAI has still loading your model but available at the following link: {koboldai_vars.cloudflare_link}")
logger.message(f"KoboldAI has still loading your model but available at the following link for the Classic UI: {koboldai_vars.cloudflare_link}/classic")
logger.message(f"KoboldAI has still loading your model but available at the following link for KoboldAI Lite: {koboldai_vars.cloudflare_link}/lite")
logger.message(f"KoboldAI has still loading your model but available at the following link for the API: [Loading Model...]")
logger.message(f"While the model loads you can use the above links to begin setting up your session, for generations you must wait until after its done loading.")
global badwords
# These are the tokens that we don't want the AI to ever write
@@ -1302,7 +1303,7 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword
except Exception as e:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
try:
model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
model = AutoModelForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache", use_safetensors=False)
except Exception as e:
model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.custmodpth, revision=koboldai_vars.revision, cache_dir="cache")
elif(os.path.isdir("models/{}".format(koboldai_vars.model.replace('/', '_')))):
@@ -1317,7 +1318,7 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword
except Exception as e:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
try:
model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache")
model = AutoModelForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache", use_safetensors=False)
except Exception as e:
model = GPTNeoForCausalLM.from_pretrained("models/{}".format(koboldai_vars.model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache")
else:
@@ -1332,7 +1333,7 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword
except Exception as e:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", revision=koboldai_vars.revision, cache_dir="cache")
try:
model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache")
model = AutoModelForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache", use_safetensors=False)
except Exception as e:
model = GPTNeoForCausalLM.from_pretrained(koboldai_vars.model, revision=koboldai_vars.revision, cache_dir="cache")