upstream merge

2025-06-05 21:59:24 +02:00 · 2022-02-02 15:04:59 -05:00
parent 1470b1666d c14e6fe5d2
commit e2d2ebcae6
11 changed files with 244 additions and 130 deletions
--- a/aiserver.py
+++ b/aiserver.py
@ -11,6 +11,7 @@ eventlet.monkey_patch(all=True, thread=False)
 import os
 os.system("")
 os.environ['EVENTLET_THREADPOOL_SIZE'] = '1'
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 from eventlet import tpool

 from os import path, getcwd
@ -23,6 +24,8 @@ import packaging
 import contextlib
 import traceback
 import threading
+import markdown
+import bleach
 from collections.abc import Iterable
 from typing import Any, Callable, TypeVar, Tuple, Union, Dict, Set, List

@ -65,22 +68,22 @@ class colors:
 modellist = [
    ["Load a model from its directory", "NeoCustom", ""],
    ["Load an old GPT-2 model (eg CloverEdition)", "GPT2Custom", ""],
-    ["Skein 6B (Hybrid)", "KoboldAI/GPT-J-6B-Skein", "12GB"],
-    ["Adventure 6B", "KoboldAI/GPT-J-6B-Adventure", "12GB"],
-    ["Lit 6B (NSFW)", "hakurei/lit-6B", "12GB"],
-    ["C1 6B (Chatbot)", "hakurei/c1-6B", "12GB"],
-    ["Picard 2.7B (Novel)", "KoboldAI/GPT-Neo-2.7B-Picard", "6GB"],
-    ["Adventure 2.7B", "KoboldAI/GPT-Neo-2.7B-AID", "6GB"],
-    ["Horni 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Horni", "6GB"],
-    ["Horni-LN 2.7B (Novel)", "KoboldAI/GPT-Neo-2.7B-Horni-LN", "6GB"],
-    ["Shinen 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Shinen", "6GB"],
-    ["GPT-J 6B", "EleutherAI/gpt-j-6B", "12GB"],
-    ["GPT-Neo 2.7B", "EleutherAI/gpt-neo-2.7B", "6GB"],
-    ["GPT-Neo 1.3B", "EleutherAI/gpt-neo-1.3B", "3GB"],
-    ["GPT-2 XL", "gpt2-xl", "8GB"],
+    ["Skein 6B (Hybrid)", "KoboldAI/GPT-J-6B-Skein", "16GB"],
+    ["Adventure 6B", "KoboldAI/GPT-J-6B-Adventure", "16GB"],
+    ["Lit 6B (NSFW)", "hakurei/lit-6B", "16GB"],
+    ["C1 6B (Chatbot)", "hakurei/c1-6B", "16GB"],
+    ["Picard 2.7B (Novel)", "KoboldAI/GPT-Neo-2.7B-Picard", "8GB"],
+    ["Adventure 2.7B", "KoboldAI/GPT-Neo-2.7B-AID", "8GB"],
+    ["Horni 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Horni", "8GB"],
+    ["Horni-LN 2.7B (Novel)", "KoboldAI/GPT-Neo-2.7B-Horni-LN", "8GB"],
+    ["Shinen 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Shinen", "8GB"],
+    ["GPT-J 6B", "EleutherAI/gpt-j-6B", "16GB"],
+    ["GPT-Neo 2.7B", "EleutherAI/gpt-neo-2.7B", "8GB"],
+    ["GPT-Neo 1.3B", "EleutherAI/gpt-neo-1.3B", "6GB"],
+    ["GPT-2 XL", "gpt2-xl", "6GB"],
    ["GPT-2 Large", "gpt2-large", "4GB"],
    ["GPT-2 Med", "gpt2-medium", "2GB"],
-    ["GPT-2", "gpt2", "1GB"],
+    ["GPT-2", "gpt2", "2GB"],
    ["OpenAI API (requires API key)", "OAI", ""],
    ["InferKit API (requires API key)", "InferKit", ""],
    ["KoboldAI Server API (Old Google Colab)", "Colab", ""],
@ -183,6 +186,7 @@ class vars:
    useprompt   = False   # Whether to send the full prompt with every submit action
    breakmodel  = False  # For GPU users, whether to use both system RAM and VRAM to conserve VRAM while offering speedup compared to CPU-only
    bmsupported = False  # Whether the breakmodel option is supported (GPT-Neo/GPT-J only, currently)
+    nobreakmodel = False  # Something specifically requested Breakmodel to be disabled (for example, a model's config)
    smandelete  = False  # Whether stories can be deleted from inside the browser
    smanrename  = False  # Whether stories can be renamed from inside the browser
    allowsp     = False  # Whether we are allowed to use soft prompts (by default enabled if we're using GPT-2, GPT-Neo or GPT-J)
@ -202,6 +206,8 @@ class vars:
    nopromptgen = False
    rngpersist  = False
    nogenmod    = False
+    welcome     = False  # Custom Welcome Text (False is default)
+    newlinemode = "n"
    quiet       = False # If set will suppress any story text from being printed to the console (will only be seen on the client web page)
    debug       = False # If set to true, will send debug information to the client for display

@ -209,7 +215,7 @@ class vars:
 # Function to get model selection at startup
 #==================================================================#
 def getModelSelection():
-    print("    #   Model                           V/RAM\n    =========================================")
+    print("    #   Model                           VRAM\n    =========================================")
    i = 1
    for m in modellist:
        print("    {0} - {1}\t\t{2}".format("{:<2}".format(i), m[0].ljust(15), m[2]))
@ -301,7 +307,7 @@ def device_config(model):
            assert sum(breakmodel.gpu_blocks) <= n_layers
            n_layers -= sum(breakmodel.gpu_blocks)
        except:
-            print("WARNING: --layers is malformatted. Please use the --help option to see correct usage of --layers. Defaulting to all layers on device 0.", file=sys.stderr)
+            print("WARNING: --breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0.", file=sys.stderr)
            breakmodel.gpu_blocks = [n_layers]
            n_layers = 0
    elif(args.breakmodel_layers is not None):
@ -389,6 +395,54 @@ def device_config(model):
    generator = model.generate
    breakmodel.move_hidden_layers(model.transformer)

+#==================================================================#
+#  Allow the models to override some settings
+#==================================================================#
+def loadmodelsettings():
+    try:
+        model_js_config = str(model_config).partition(' ')[2]
+        js   = json.loads(model_js_config)
+    except Exception as e:
+        try:
+            model_js_config = open(vars.custmodpth + "/config.json", "r")
+        except Exception as e:
+            model_js_config = open(vars.custmodpth.replace('/', '_') + "/config.json", "r")
+        js   = json.load(model_js_config)
+    if("badwordsids" in js):
+        vars.badwordsids = js["badwordsids"]
+    if("nobreakmodel" in js):
+        vars.nobreakmodel = js["nobreakmodel"]
+    if("temp" in js):
+        vars.temp       = js["temp"]
+    if("top_p" in js):
+        vars.top_p      = js["top_p"]
+    if("top_k" in js):
+        vars.top_k      = js["top_k"]
+    if("tfs" in js):
+        vars.tfs        = js["tfs"]
+    if("rep_pen" in js):
+        vars.rep_pen    = js["rep_pen"]
+    if("rep_pen_slope" in js):
+        vars.rep_pen_slope = js["rep_pen_slope"]
+    if("rep_pen_range" in js):
+        vars.rep_pen_range = js["rep_pen_range"]
+    if("adventure" in js):
+        vars.adventure = js["adventure"]
+    if("chatmode" in js):
+        vars.chatmode = js["chatmode"]
+    if("dynamicscan" in js):
+        vars.dynamicscan = js["dynamicscan"]
+    if("formatoptns" in js):
+        vars.formatoptns = js["formatoptns"]
+    if("welcome" in js):
+        vars.welcome = js["welcome"]
+    if("newlinemode" in js):
+        vars.newlinemode = js["newlinemode"]
+    if("antemplate" in js):
+        vars.setauthornotetemplate = js["antemplate"]
+        if(not vars.gamestarted):
+            vars.authornotetemplate = vars.setauthornotetemplate
+
 #==================================================================#
 # Startup
 #==================================================================#
@ -402,11 +456,12 @@ parser.add_argument("--path", help="Specify the Path for local models (For model
 parser.add_argument("--cpu", action='store_true', help="By default unattended launches are on the GPU use this option to force CPU usage.")
 parser.add_argument("--breakmodel", action='store_true', help=argparse.SUPPRESS)
 parser.add_argument("--breakmodel_layers", type=int, help=argparse.SUPPRESS)
-parser.add_argument("--breakmodel_gpulayers", type=str, help="If using a model that supports hybrid generation, this is a comma-separated list that specifies how many layers to put on each GPU device. For example to put 8 layers on device 0, 9 layers on device 1 and 11 layers on device 2, use --layers 8,9,11")
+parser.add_argument("--breakmodel_gpulayers", type=str, help="If using a model that supports hybrid generation, this is a comma-separated list that specifies how many layers to put on each GPU device. For example to put 8 layers on device 0, 9 layers on device 1 and 11 layers on device 2, use --breakmodel_gpulayers 8,9,11")
 parser.add_argument("--override_delete", action='store_true', help="Deleting stories from inside the browser is disabled if you are using --remote and enabled otherwise. Using this option will instead allow deleting stories if using --remote and prevent deleting stories otherwise.")
 parser.add_argument("--override_rename", action='store_true', help="Renaming stories from inside the browser is disabled if you are using --remote and enabled otherwise. Using this option will instead allow renaming stories if using --remote and prevent renaming stories otherwise.")
 parser.add_argument("--configname", help="Force a fixed configuration name to aid with config management.")
 parser.add_argument("--colab", action='store_true', help="Optimize for Google Colab.")
+parser.add_argument("--nobreakmodel", action='store_true', help="Disables Breakmodel support completely.")
 parser.add_argument("--share", action='store_true', default=False, help="If present will launch KoboldAI available to all computers rather than local only")
 parser.add_argument("--quiet", action='store_true', default=False, help="If present will suppress any story related text from showing on the console")

@ -426,7 +481,11 @@ if args.colab:
    args.remote = True;
    args.override_rename = True;
    args.override_delete = True;
-    
+    args.nobreakmodel = True;
+
+if args.nobreakmodel:
+    vars.nobreakmodel = True;
+
 if args.remote:
    vars.remote = True;

@ -445,7 +504,7 @@ if args.model:
        vars.colaburl = args.path + "/request"; # Lets just use the same parameter to keep it simple

 else:
-    print("{0}Welcome to the KoboldAI Server!\nSelect an AI model to continue:{1}\n".format(colors.CYAN, colors.END))
+    print("{0}Welcome to the KoboldAI Server!\nListed RAM is the optimal VRAM and CPU ram can be up to twice the amount.\nMost models can run at less VRAM with reduced max tokens or less layers on the GPU.\nSelect an AI model to continue:{1}\n".format(colors.CYAN, colors.END))
    getModelSelection()

 # If transformers model was selected & GPU available, ask to use CPU or GPU
@ -482,15 +541,19 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
    elif(vars.model_type == "not_found"):
        print("WARNING: No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)")
        vars.model_type = "gpt_neo"
+    loadmodelsettings()
    print("{0}Looking for GPU support...{1}".format(colors.PURPLE, colors.END), end="")
    vars.hascuda = torch.cuda.is_available()
-    vars.bmsupported = vars.model_type in ("gpt_neo", "gptj") and not args.colab
+    vars.bmsupported = vars.model_type in ("gpt_neo", "gptj") and not vars.nobreakmodel
    if(args.breakmodel is not None and args.breakmodel):
-        print("WARNING: --breakmodel is no longer supported. Breakmodel mode is now automatically enabled when --layers is used (see --help for details).", file=sys.stderr)
+        print("WARNING: --breakmodel is no longer supported. Breakmodel mode is now automatically enabled when --breakmodel_gpulayers is used (see --help for details).", file=sys.stderr)
    if(args.breakmodel_layers is not None):
-        print("WARNING: --breakmodel_layers is deprecated. Use --layers instead (see --help for details).", file=sys.stderr)
+        print("WARNING: --breakmodel_layers is deprecated. Use --breakmodel_gpulayers instead (see --help for details).", file=sys.stderr)
+    if(args.model and vars.bmsupported and not args.breakmodel_gpulayers and not args.breakmodel_layers):
+        print("WARNING: Model launched without the --breakmodel_gpulayers argument, defaulting to GPU only mode.", file=sys.stderr)
+        vars.bmsupported = False
    if(not vars.bmsupported and (args.breakmodel_gpulayers is not None or args.breakmodel_layers is not None)):
-        print("WARNING: This model does not support hybrid generation. --layers will be ignored.", file=sys.stderr)
+        print("WARNING: This model does not support hybrid generation. --breakmodel_gpulayers will be ignored.", file=sys.stderr)
    if(vars.hascuda):
        print("{0}FOUND!{1}".format(colors.GREEN, colors.END))
    else:
@ -917,7 +980,7 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
                       model     = AutoModelForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache/", **lowmem)
                   except ValueError as e:
                       model     = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache/", **lowmem)
-            elif(os.path.isdir(format(vars.model.replace('/', '_')))):
+            elif(os.path.isdir(vars.model.replace('/', '_'))):
               with(maybe_use_float16()):
                   try:
                       tokenizer = AutoTokenizer.from_pretrained(vars.model.replace('/', '_'), cache_dir="cache/")
@ -1071,6 +1134,7 @@ else:
        print("{0}Initializing Mesh Transformer JAX, please wait...{1}".format(colors.PURPLE, colors.END))
        assert vars.model == "TPUMeshTransformerGPTJ" and vars.custmodpth and os.path.isdir(vars.custmodpth)
        import tpu_mtj_backend
+        tpu_mtj_backend.vars = vars
        tpu_mtj_backend.warper_callback = tpumtjgenerate_warper_callback
        tpu_mtj_backend.stopping_callback = tpumtjgenerate_stopping_callback
        tpu_mtj_backend.compiling_callback = tpumtjgenerate_compiling_callback
@ -2129,14 +2193,25 @@ def sendUSStatItems():
    emit('from_server', {'cmd': 'usstatitems', 'data': loaded, 'flash': last_userscripts != vars.last_userscripts}, broadcast=True)
    vars.last_userscripts = last_userscripts

+#==================================================================#
+#  KoboldAI Markup Formatting (Mixture of Markdown and sanitized HTML)
+#==================================================================#
+def kml(txt):
+   txt = txt.replace('>', '&gt;')
+   txt = bleach.clean(markdown.markdown(txt), tags = ['p', 'em', 'strong', 'code', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'ul', 'b', 'i', 'a', 'span', 'button'], styles = ['color', 'font-weight'], attributes=['id', 'class', 'style', 'href'])
+   return txt
+
 #==================================================================#
 #  Send start message and tell Javascript to set UI state
 #==================================================================#
 def setStartState():
-    txt = "<span>Welcome to <span class=\"color_cyan\">KoboldAI</span>! You are running <span class=\"color_green\">"+getmodelname()+"</span>.<br/>"
-    if(not vars.noai):
-        txt = txt + "Please load a game or enter a prompt below to begin!</span>"
+    if(vars.welcome):
+        txt = kml(vars.welcome) + "<br/>"
    else:
+        txt = "<span>Welcome to <span class=\"color_cyan\">KoboldAI</span>! You are running <span class=\"color_green\">"+getmodelname()+"</span>.<br/>"
+    if(not vars.noai and not vars.welcome):
+        txt = txt + "Please load a game or enter a prompt below to begin!</span>"
+    if(vars.noai):
        txt = txt + "Please load or import a story to read. There is no AI in this mode."
    emit('from_server', {'cmd': 'updatescreen', 'gamestarted': vars.gamestarted, 'data': txt}, broadcast=True)
    emit('from_server', {'cmd': 'setgamestate', 'data': 'start'}, broadcast=True)
@ -2190,6 +2265,8 @@ def savesettings():
    js["rngpersist"]  = vars.rngpersist
    js["nogenmod"]    = vars.nogenmod
    js["autosave"]    = vars.autosave
+    js["welcome"]     = vars.welcome
+    js["newlinemode"] = vars.newlinemode

    js["antemplate"]  = vars.setauthornotetemplate

@ -2264,6 +2341,10 @@ def loadsettings():
            vars.nogenmod = js["nogenmod"]
        if("autosave" in js):
            vars.autosave = js["autosave"]
+        if("newlinemode" in js):
+            vars.newlinemode = js["newlinemode"]
+        if("welcome" in js):
+            vars.welcome = js["welcome"]

        if("antemplate" in js):
            vars.setauthornotetemplate = js["antemplate"]
@ -2291,47 +2372,6 @@ def loadsettings():

        file.close()

-#==================================================================#
-#  Allow the models to override some settings
-#==================================================================#
-def loadmodelsettings():
-    try:
-        model_js_config = str(model_config).partition(' ')[2]
-        js   = json.loads(model_js_config)
-    except Exception as e:
-        try:
-            model_js_config = open(vars.custmodpth + "/config.json", "r")
-        except Exception as e:
-            model_js_config = open(vars.custmodpth.replace('/', '_') + "/config.json", "r")
-        js   = json.load(model_js_config)
-    if("badwordsids" in js):
-        vars.badwordsids = js["badwordsids"]
-    if("temp" in js):
-        vars.temp       = js["temp"]
-    if("top_p" in js):
-        vars.top_p      = js["top_p"]
-    if("top_k" in js):
-        vars.top_k      = js["top_k"]
-    if("tfs" in js):
-        vars.tfs        = js["tfs"]
-    if("rep_pen" in js):
-        vars.rep_pen    = js["rep_pen"]
-    if("rep_pen_slope" in js):
-        vars.rep_pen_slope = js["rep_pen_slope"]
-    if("rep_pen_range" in js):
-        vars.rep_pen_range = js["rep_pen_range"]
-    if("adventure" in js):
-        vars.adventure = js["adventure"]
-    if("chatmode" in js):
-        vars.chatmode = js["chatmode"]
-    if("dynamicscan" in js):
-        vars.dynamicscan = js["dynamicscan"]
-    if("formatoptns" in js):
-        vars.formatoptns = js["formatoptns"]
-    if("antemplate" in js):
-        vars.setauthornotetemplate = js["antemplate"]
-        if(not vars.gamestarted):
-            vars.authornotetemplate = vars.setauthornotetemplate

 #==================================================================#
 #  Don't save settings unless 2 seconds have passed without modification
@ -2369,7 +2409,7 @@ def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False,
    # Ignore new submissions if the AI is currently busy
    if(vars.aibusy):
        return
-
+    
    while(True):
        set_aibusy(1)

@ -2392,7 +2432,11 @@ def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False,
            data = re.sub(r'\n+', ' ', data)
            if(len(data)):
                data = f"\n{vars.chatname} : {data}\n"
-                
+        
+        # </s> mode
+        if(vars.newlinemode == "s"):
+            data = data.replace('\n', "</s>")
+        
        # If we're not continuing, store a copy of the raw input
        if(data != ""):
            vars.lastact = data
@ -2744,7 +2788,7 @@ def calcsubmit(txt):
    actionlen    = len(vars.actions)

    winfo, mem, anotetxt, found_entries = calcsubmitbudgetheader(txt)
-
+ 
    # For all transformers models
    if(vars.model != "InferKit"):
        subtxt, min, max = calcsubmitbudget(actionlen, winfo, mem, anotetxt, vars.actions, submission=txt)
@ -3296,7 +3340,7 @@ def tpumtjgenerate(txt, minimum, maximum, found_entries=None):
 # Replaces returns and newlines with HTML breaks
 #==================================================================#
 def formatforhtml(txt):
-    return txt.replace("\\r\\n", "<br/>").replace("\\r", "<br/>").replace("\\n", "<br/>").replace("\r\n", "<br/>").replace('\n', '<br/>').replace('\r', '<br/>')
+    return txt.replace("\\r\\n", "<br/>").replace("\\r", "<br/>").replace("\\n", "<br/>").replace("\r\n", "<br/>").replace('\n', '<br/>').replace('\r', '<br/>').replace('&lt;/s&gt;', '<br/>')

 #==================================================================#
 # Strips submitted text from the text returned by the AI
@ -3323,13 +3367,16 @@ def applyinputformatting(txt):
    # Add sentence spacing
    if(vars.formatoptns["frmtadsnsp"]):
        txt = utils.addsentencespacing(txt, vars)
-    
+ 
    return txt

 #==================================================================#
 # Applies chosen formatting options to text returned from AI
 #==================================================================#
 def applyoutputformatting(txt):
+    # Revert S mode on output to maintain compatibility
+    txt = txt.replace('</s>', "\n")
+
    # Use standard quotes and apostrophes
    txt = utils.fixquotes(txt)

@ -4781,8 +4828,6 @@ def randomGameRequest(topic, memory=""):
    emit('from_server', {'cmd': 'setmemory', 'data': vars.memory}, broadcast=True)

 # Load desired settings from both the model and the users config file
-if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransformerGPTJ"]):
-    loadmodelsettings()
 loadsettings()

 # Prevent tokenizer from taking extra time the first time it's used