Merge branch 'united' of https://github.com/ebolam/KoboldAI into united

2025-06-05 21:59:24 +02:00 · 2022-08-31 09:34:24 -04:00
parent 1031b70731 39944c4258
commit 569f4cbce4
1 changed files with 119 additions and 18 deletions
--- a/aiserver.py
+++ b/aiserver.py
@@ -217,6 +217,7 @@ model_menu = {
        ["InferKit API (requires API key)", "InferKit", "", False],
        # ["KoboldAI Server API (Old Google Colab)", "Colab", "", False],
        ["KoboldAI API", "API", "", False],
+        ["KoboldAI Cluster", "CLUSTER", "", False],
        ["Return to Main Menu", "mainmenu", "", True],
    ]
    }
@@ -318,6 +319,7 @@ class vars:
    colaburl    = ""     # Ngrok url for Google Colab mode
    apikey      = ""     # API key to use for InferKit API calls
    oaiapikey   = ""     # API key to use for OpenAI API calls
+    cluster_requested_models = [] # The models which we allow to generate during cluster mode
    savedir     = getcwd()+"\\stories"
    hascuda     = False  # Whether torch has detected CUDA on the system
    usegpu      = False  # Whether to launch pipeline with GPU support
@@ -1287,6 +1289,8 @@ def general_startup(override_args=None):
    parser.add_argument("--aria2_port", type=int, help="Specify the port on which aria2's RPC interface will be open if aria2 is installed (defaults to 6799)")
    parser.add_argument("--model", help="Specify the Model Type to skip the Menu")
    parser.add_argument("--path", help="Specify the Path for local models (For model NeoCustom or GPT2Custom)")
+    parser.add_argument("--apikey", help="Specify the API key to use for online services")
+    parser.add_argument("--req_model", type=str, action='append', required=False, help="Which models which we allow to generate for us during cluster mode. Can be specified multiple times.")
    parser.add_argument("--revision", help="Specify the model revision for huggingface models (can be a git branch/tag name or a git commit hash)")
    parser.add_argument("--cpu", action='store_true', help="By default unattended launches are on the GPU use this option to force CPU usage.")
    parser.add_argument("--breakmodel", action='store_true', help=argparse.SUPPRESS)
@@ -1335,6 +1339,11 @@ def general_startup(override_args=None):
    vars.model = args.model;
    vars.revision = args.revision

+    if args.apikey:
+        vars.apikey = args.apikey
+    if args.req_model:
+        vars.cluster_requested_models = args.req_model
+
    if args.colab:
        args.remote = True;
        args.override_rename = True;
@@ -1484,7 +1493,7 @@ def get_model_info(model, directory=""):
    

 def get_layer_count(model, directory=""):
-    if(model not in ["InferKit", "Colab", "API", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]):
+    if(model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]):
        if(model == "GPT2Custom"):
            with open(os.path.join(directory, "config.json"), "r") as f:
                model_config = json.load(f)
@@ -2087,7 +2096,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
    
    
    # If transformers model was selected & GPU available, ask to use CPU or GPU
-    if(vars.model not in ["InferKit", "Colab", "API", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+    if(vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
        vars.allowsp = True
        # Test for GPU support
        
@@ -2126,7 +2135,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
            print("WARNING: No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)")
            vars.model_type = "gpt_neo"

-    if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "API", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+    if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
        loadmodelsettings()
        loadsettings()
        print(2)
@@ -2180,7 +2189,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
        vars.noai = True

    # Start transformers and create pipeline
-    if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "API", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+    if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
        if(not vars.noai):
            print("{0}Initializing transformers, please wait...{1}".format(colors.PURPLE, colors.END))
            for m in ("GPTJModel", "XGLMModel"):
@@ -2635,7 +2644,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
            }

        # If we're running Colab or OAI, we still need a tokenizer.
-        if(vars.model in ("Colab", "API")):
+        if(vars.model in ("Colab", "API", "CLUSTER")):
            from transformers import GPT2TokenizerFast
            tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-2.7B", revision=vars.revision, cache_dir="cache")
            loadsettings()
@@ -3281,7 +3290,7 @@ def lua_set_chunk(k, v):
 def lua_get_modeltype():
    if(vars.noai):
        return "readonly"
-    if(vars.model in ("Colab", "API", "OAI", "InferKit")):
+    if(vars.model in ("Colab", "API", "CLUSTER", "OAI", "InferKit")):
        return "api"
    if(not vars.use_colab_tpu and vars.model not in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX") and (vars.model in ("GPT2Custom", "NeoCustom") or vars.model_type in ("gpt2", "gpt_neo", "gptj"))):
        hidden_size = get_hidden_size_from_model(model)
@@ -3310,7 +3319,7 @@ def lua_get_modeltype():
 def lua_get_modelbackend():
    if(vars.noai):
        return "readonly"
-    if(vars.model in ("Colab", "API", "OAI", "InferKit")):
+    if(vars.model in ("Colab", "API", "CLUSTER", "OAI", "InferKit")):
        return "api"
    if(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")):
        return "mtj"
@@ -4033,11 +4042,19 @@ def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False,
    while(True):
        set_aibusy(1)

-        if(vars.model == "API"):
+        if(vars.model in ["API","CLUSTER"]):
            global tokenizer
-            tokenizer_id = requests.get(
-                vars.colaburl[:-8] + "/api/v1/model",
-            ).json()["result"]
+            if vars.model == "API":
+                tokenizer_id = requests.get(
+                    vars.colaburl[:-8] + "/api/v1/model",
+                ).json()["result"]
+            elif len(vars.cluster_requested_models) >= 1:
+                # If the player has requested one or more models, we use the first one for the tokenizer
+                tokenizer_id = vars.cluster_requested_models[0]
+            # The cluster can return any number of possible models for each gen, but this happens after this step
+            # So at this point, this is unknown
+            else:
+                tokenizer_id = ""
            if tokenizer_id != vars.api_tokenizer_id:
                try:
                    if(os.path.isdir(tokenizer_id)):
@@ -4283,6 +4300,8 @@ def apiactionsubmit(data, use_memory=False, use_world_info=False, use_story=Fals
        raise NotImplementedError("API generation is not supported in old Colab API mode.")
    elif(vars.model == "API"):
        raise NotImplementedError("API generation is not supported in API mode.")
+    elif(vars.model == "CLUSTER"):
+        raise NotImplementedError("API generation is not supported in API mode.")
    elif(vars.model == "OAI"):
        raise NotImplementedError("API generation is not supported in OpenAI/GooseAI mode.")
    elif(vars.model == "ReadOnly"):
@@ -4333,7 +4352,7 @@ def apiactionsubmit(data, use_memory=False, use_world_info=False, use_story=Fals
    minimum = len(tokens) + 1
    maximum = len(tokens) + vars.genamt

-    if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+    if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "CLUSTER", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
        genout = apiactionsubmit_generate(tokens, minimum, maximum)
    elif(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")):
        genout = apiactionsubmit_tpumtjgenerate(tokens, minimum, maximum)
@@ -4501,7 +4520,7 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None,

    if(actionlen == 0):
        # First/Prompt action
-        tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "OAI") else []) + memtokens + witokens + anotetkns + prompttkns
+        tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "CLUSTER", "OAI") else []) + memtokens + witokens + anotetkns + prompttkns
        assert len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction
        ln = len(tokens) + lnsp
        return tokens, ln+1, ln+vars.genamt
@@ -4549,12 +4568,12 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None,
        # Did we get to add the A.N.? If not, do it here
        if(anotetxt != ""):
            if((not anoteadded) or forceanote):
-                tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "OAI") else []) + memtokens + witokens + anotetkns + prompttkns + tokens
+                tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "CLUSTER", "OAI") else []) + memtokens + witokens + anotetkns + prompttkns + tokens
            else:
-                tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "OAI") else []) + memtokens + witokens + prompttkns + tokens
+                tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "CLUSTER", "OAI") else []) + memtokens + witokens + prompttkns + tokens
        else:
            # Prepend Memory, WI, and Prompt before action tokens
-            tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "OAI") else []) + memtokens + witokens + prompttkns + tokens
+            tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "CLUSTER", "OAI") else []) + memtokens + witokens + prompttkns + tokens

        # Send completed bundle to generator
        assert len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction
@@ -4576,23 +4595,27 @@ def calcsubmit(txt):
    if(vars.model != "InferKit"):
        subtxt, min, max = calcsubmitbudget(actionlen, winfo, mem, anotetxt, vars.actions, submission=txt)
        if(actionlen == 0):
-            if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+            if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "CLUSTER", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
                generate(subtxt, min, max, found_entries=found_entries)
            elif(vars.model == "Colab"):
                sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
            elif(vars.model == "API"):
                sendtoapi(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
+            elif(vars.model == "CLUSTER"):
+                sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
            elif(vars.model == "OAI"):
                oairequest(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
            elif(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")):
                tpumtjgenerate(subtxt, min, max, found_entries=found_entries)
        else:
-            if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+            if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "CLUSTER", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
                generate(subtxt, min, max, found_entries=found_entries)
            elif(vars.model == "Colab"):
                sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
            elif(vars.model == "API"):
                sendtoapi(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
+            elif(vars.model == "CLUSTER"):
+                sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
            elif(vars.model == "OAI"):
                oairequest(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
            elif(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")):
@@ -5072,6 +5095,84 @@ def sendtoapi(txt, min, max):
        set_aibusy(0)
        return

+#==================================================================#
+#  Send transformers-style request to KoboldAI Cluster
+#==================================================================#
+def sendtocluster(txt, min, max):
+    # Log request to console
+    if not vars.quiet:
+        print("{0}Tokens:{1}, Txt:{2}{3}".format(colors.YELLOW, min-1, txt, colors.END))
+
+    # Store context in memory to use it for comparison with generated content
+    vars.lastctx = txt
+
+    # Build request JSON data
+    reqdata = {
+        'max_length': max - min + 1,
+        'max_context_length': vars.max_length,
+        'rep_pen': vars.rep_pen,
+        'rep_pen_slope': vars.rep_pen_slope,
+        'rep_pen_range': vars.rep_pen_range,
+        'temperature': vars.temp,
+        'top_p': vars.top_p,
+        'top_k': vars.top_k,
+        'top_a': vars.top_a,
+        'tfs': vars.tfs,
+        'typical': vars.typical,
+        'n': vars.numseqs,
+    }
+    cluster_metadata = {
+        'prompt': txt,
+        'params': reqdata,
+        'username': vars.apikey,
+        'models': vars.cluster_requested_models,
+    }
+
+    # Create request
+    req = requests.post(
+        vars.colaburl[:-8] + "/generate/sync",
+        json=cluster_metadata,
+    )
+    js = req.json()
+    if(req.status_code == 503):
+        errmsg = "KoboldAI API Error: No available KoboldAI servers found in cluster to fulfil this request using the selected models and requested lengths."
+        print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END))
+        emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
+        set_aibusy(0)
+        return
+    if(req.status_code != 200):
+        errmsg = "KoboldAI API Error: Failed to get a reply from the server. Please check the console."
+        print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END))
+        emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
+        set_aibusy(0)
+        return
+    genout = js
+
+    for i in range(vars.numseqs):
+        vars.lua_koboldbridge.outputs[i+1] = genout[i]
+
+    execute_outmod()
+    if(vars.lua_koboldbridge.regeneration_required):
+        vars.lua_koboldbridge.regeneration_required = False
+        genout = []
+        for i in range(vars.numseqs):
+            genout.append(vars.lua_koboldbridge.outputs[i+1])
+            assert type(genout[-1]) is str
+
+    if(len(genout) == 1):
+        genresult(genout[0])
+    else:
+        # Convert torch output format to transformers
+        seqs = []
+        for seq in genout:
+            seqs.append({"generated_text": seq})
+        if(vars.lua_koboldbridge.restart_sequence is not None and vars.lua_koboldbridge.restart_sequence > 0):
+            genresult(genout[vars.lua_koboldbridge.restart_sequence-1]["generated_text"])
+        else:
+            genselect(genout)
+
+    set_aibusy(0)
+    return

 #==================================================================#
 #  Send text to TPU mesh transformer backend