diff --git a/aiserver.py b/aiserver.py
index bf4cfac7..a09a0714 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -217,6 +217,7 @@ model_menu = {
         ["InferKit API (requires API key)", "InferKit", "", False],
         # ["KoboldAI Server API (Old Google Colab)", "Colab", "", False],
         ["KoboldAI API", "API", "", False],
+        ["KoboldAI Cluster", "CLUSTER", "", False],
         ["Return to Main Menu", "mainmenu", "", True],
         ]
     }
@@ -318,6 +319,7 @@ class vars:
     colaburl    = ""     # Ngrok url for Google Colab mode
     apikey      = ""     # API key to use for InferKit API calls
     oaiapikey   = ""     # API key to use for OpenAI API calls
+    cluster_requested_models = [] # The models we allow to generate during cluster mode
     savedir     = getcwd()+"\\stories"
     hascuda     = False  # Whether torch has detected CUDA on the system
     usegpu      = False  # Whether to launch pipeline with GPU support
@@ -1287,6 +1289,8 @@ def general_startup(override_args=None):
     parser.add_argument("--aria2_port", type=int, help="Specify the port on which aria2's RPC interface will be open if aria2 is installed (defaults to 6799)")
     parser.add_argument("--model", help="Specify the Model Type to skip the Menu")
     parser.add_argument("--path", help="Specify the Path for local models (For model NeoCustom or GPT2Custom)")
+    parser.add_argument("--apikey", help="Specify the API key to use for online services")
+    parser.add_argument("--req_model", type=str, action='append', required=False, help="Which models we allow to generate for us during cluster mode. Can be specified multiple times.")
     parser.add_argument("--revision", help="Specify the model revision for huggingface models (can be a git branch/tag name or a git commit hash)")
     parser.add_argument("--cpu", action='store_true', help="By default unattended launches are on the GPU use this option to force CPU usage.")
     parser.add_argument("--breakmodel", action='store_true', help=argparse.SUPPRESS)
@@ -1335,6 +1339,11 @@ def general_startup(override_args=None):
         vars.model = args.model;
         vars.revision = args.revision
 
+    if args.apikey:
+        vars.apikey = args.apikey
+    if args.req_model:
+        vars.cluster_requested_models = args.req_model
+
     if args.colab:
         args.remote = True;
         args.override_rename = True;
@@ -1479,7 +1488,7 @@ def get_model_info(model, directory=""):
 
 
 def get_layer_count(model, directory=""):
-    if(model not in ["InferKit", "Colab", "API", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]):
+    if(model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]):
         if(model == "GPT2Custom"):
             with open(os.path.join(directory, "config.json"), "r") as f:
                 model_config = json.load(f)
@@ -2034,7 +2043,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
 
 
     # If transformers model was selected & GPU available, ask to use CPU or GPU
-    if(vars.model not in ["InferKit", "Colab", "API", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+    if(vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
         vars.allowsp = True
 
         # Test for GPU support
@@ -2073,7 +2082,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             print("WARNING: No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)")
             vars.model_type = "gpt_neo"
 
-    if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "API", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+    if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
         loadmodelsettings()
         loadsettings()
         print(2)
@@ -2127,7 +2136,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             vars.noai = True
 
     # Start transformers and create pipeline
-    if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "API", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+    if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
         if(not vars.noai):
             print("{0}Initializing transformers, please wait...{1}".format(colors.PURPLE, colors.END))
             for m in ("GPTJModel", "XGLMModel"):
@@ -2582,7 +2591,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         }
 
     # If we're running Colab or OAI, we still need a tokenizer.
-    if(vars.model in ("Colab", "API")):
+    if(vars.model in ("Colab", "API", "CLUSTER")):
         from transformers import GPT2TokenizerFast
         tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-2.7B", revision=vars.revision, cache_dir="cache")
         loadsettings()
@@ -3228,7 +3237,7 @@ def lua_set_chunk(k, v):
 def lua_get_modeltype():
     if(vars.noai):
         return "readonly"
-    if(vars.model in ("Colab", "API", "OAI", "InferKit")):
+    if(vars.model in ("Colab", "API", "CLUSTER", "OAI", "InferKit")):
         return "api"
     if(not vars.use_colab_tpu and vars.model not in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX") and (vars.model in ("GPT2Custom", "NeoCustom") or vars.model_type in ("gpt2", "gpt_neo", "gptj"))):
         hidden_size = get_hidden_size_from_model(model)
@@ -3257,7 +3266,7 @@ def lua_get_modeltype():
 def lua_get_modelbackend():
     if(vars.noai):
         return "readonly"
-    if(vars.model in ("Colab", "API", "OAI", "InferKit")):
+    if(vars.model in ("Colab", "API", "CLUSTER", "OAI", "InferKit")):
         return "api"
     if(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")):
         return "mtj"
@@ -3978,11 +3987,19 @@ def actionsubmit(data, actionmode=0, force_submit=False, force_prompt_gen=False,
 
         while(True):
             set_aibusy(1)
-            if(vars.model == "API"):
+            if(vars.model in ["API","CLUSTER"]):
                 global tokenizer
-                tokenizer_id = requests.get(
-                    vars.colaburl[:-8] + "/api/v1/model",
-                ).json()["result"]
+                if vars.model == "API":
+                    tokenizer_id = requests.get(
+                        vars.colaburl[:-8] + "/api/v1/model",
+                    ).json()["result"]
+                elif len(vars.cluster_requested_models) >= 1:
+                    # If the player has requested one or more models, we use the first one for the tokenizer
+                    tokenizer_id = vars.cluster_requested_models[0]
+                    # The cluster can serve each generation with any of the matching models, but that choice
+                    # is only made later, so the actual model is still unknown at this point
+                else:
+                    tokenizer_id = ""
                 if tokenizer_id != vars.api_tokenizer_id:
                     try:
                         if(os.path.isdir(tokenizer_id)):
@@ -4228,6 +4245,8 @@ def apiactionsubmit(data, use_memory=False, use_world_info=False, use_story=Fals
         raise NotImplementedError("API generation is not supported in old Colab API mode.")
     elif(vars.model == "API"):
         raise NotImplementedError("API generation is not supported in API mode.")
+    elif(vars.model == "CLUSTER"):
+        raise NotImplementedError("API generation is not supported in CLUSTER mode.")
     elif(vars.model == "OAI"):
         raise NotImplementedError("API generation is not supported in OpenAI/GooseAI mode.")
     elif(vars.model == "ReadOnly"):
@@ -4278,7 +4297,7 @@ def apiactionsubmit(data, use_memory=False, use_world_info=False, use_story=Fals
     minimum = len(tokens) + 1
     maximum = len(tokens) + vars.genamt
 
-    if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+    if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "CLUSTER", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
        genout = apiactionsubmit_generate(tokens, minimum, maximum)
     elif(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")):
        genout = apiactionsubmit_tpumtjgenerate(tokens, minimum, maximum)
@@ -4446,7 +4465,7 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None, 
 
     if(actionlen == 0):
         # First/Prompt action
-        tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "OAI") else []) + memtokens + witokens + anotetkns + prompttkns
+        tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "CLUSTER", "OAI") else []) + memtokens + witokens + anotetkns + prompttkns
         assert len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction
         ln = len(tokens) + lnsp
         return tokens, ln+1, ln+vars.genamt
@@ -4494,12 +4513,12 @@ def calcsubmitbudget(actionlen, winfo, mem, anotetxt, actions, submission=None, 
         # Did we get to add the A.N.? If not, do it here
         if(anotetxt != ""):
             if((not anoteadded) or forceanote):
-                tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "OAI") else []) + memtokens + witokens + anotetkns + prompttkns + tokens
+                tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "CLUSTER", "OAI") else []) + memtokens + witokens + anotetkns + prompttkns + tokens
             else:
-                tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "OAI") else []) + memtokens + witokens + prompttkns + tokens
+                tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "CLUSTER", "OAI") else []) + memtokens + witokens + prompttkns + tokens
         else:
             # Prepend Memory, WI, and Prompt before action tokens
-            tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "OAI") else []) + memtokens + witokens + prompttkns + tokens
+            tokens = (tokenizer._koboldai_header if vars.model not in ("Colab", "API", "CLUSTER", "OAI") else []) + memtokens + witokens + prompttkns + tokens
 
         # Send completed bundle to generator
         assert len(tokens) <= vars.max_length - lnsp - vars.genamt - budget_deduction
@@ -4521,23 +4540,27 @@ def calcsubmit(txt):
     if(vars.model != "InferKit"):
         subtxt, min, max  = calcsubmitbudget(actionlen, winfo, mem, anotetxt, vars.actions, submission=txt)
         if(actionlen == 0):
-            if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+            if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "CLUSTER", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
                 generate(subtxt, min, max, found_entries=found_entries)
             elif(vars.model == "Colab"):
                 sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
             elif(vars.model == "API"):
                 sendtoapi(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
+            elif(vars.model == "CLUSTER"):
+                sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
             elif(vars.model == "OAI"):
                 oairequest(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
             elif(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")):
                 tpumtjgenerate(subtxt, min, max, found_entries=found_entries)
         else:
-            if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
+            if(not vars.use_colab_tpu and vars.model not in ["Colab", "API", "CLUSTER", "OAI", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
                 generate(subtxt, min, max, found_entries=found_entries)
             elif(vars.model == "Colab"):
                 sendtocolab(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
             elif(vars.model == "API"):
                 sendtoapi(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
+            elif(vars.model == "CLUSTER"):
+                sendtocluster(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
             elif(vars.model == "OAI"):
                 oairequest(utils.decodenewlines(tokenizer.decode(subtxt)), min, max)
             elif(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")):
@@ -5017,6 +5040,84 @@ def sendtoapi(txt, min, max):
     set_aibusy(0)
     return
 
+#==================================================================#
+# Send transformers-style request to KoboldAI Cluster
+#==================================================================#
+def sendtocluster(txt, min, max):
+    # Log request to console
+    if not vars.quiet:
+        print("{0}Tokens:{1}, Txt:{2}{3}".format(colors.YELLOW, min-1, txt, colors.END))
+
+    # Store context in memory to use it for comparison with generated content
+    vars.lastctx = txt
+
+    # Build request JSON data
+    reqdata = {
+        'max_length': max - min + 1,
+        'max_context_length': vars.max_length,
+        'rep_pen': vars.rep_pen,
+        'rep_pen_slope': vars.rep_pen_slope,
+        'rep_pen_range': vars.rep_pen_range,
+        'temperature': vars.temp,
+        'top_p': vars.top_p,
+        'top_k': vars.top_k,
+        'top_a': vars.top_a,
+        'tfs': vars.tfs,
+        'typical': vars.typical,
+        'n': vars.numseqs,
+    }
+    cluster_metadata = {
+        'prompt': txt,
+        'params': reqdata,
+        'username': vars.apikey,
+        'models': vars.cluster_requested_models,
+    }
+
+    # Create request
+    req = requests.post(
+        vars.colaburl[:-8] + "/generate/sync",
+        json=cluster_metadata,
+    )
+    js = req.json()
+    if(req.status_code == 503):
+        errmsg = "KoboldAI API Error: No available KoboldAI servers found in cluster to fulfil this request using the selected models and requested lengths."
+        print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END))
+        emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
+        set_aibusy(0)
+        return
+    if(req.status_code != 200):
+        errmsg = "KoboldAI API Error: Failed to get a reply from the server. Please check the console."
+        print("{0}{1}{2}".format(colors.RED, json.dumps(js, indent=2), colors.END))
+        emit('from_server', {'cmd': 'errmsg', 'data': errmsg}, broadcast=True)
+        set_aibusy(0)
+        return
+    genout = js
+
+    for i in range(vars.numseqs):
+        vars.lua_koboldbridge.outputs[i+1] = genout[i]
+
+    execute_outmod()
+    if(vars.lua_koboldbridge.regeneration_required):
+        vars.lua_koboldbridge.regeneration_required = False
+        genout = []
+        for i in range(vars.numseqs):
+            genout.append(vars.lua_koboldbridge.outputs[i+1])
+            assert type(genout[-1]) is str
+
+    if(len(genout) == 1):
+        genresult(genout[0])
+    else:
+        # Convert torch output format to transformers
+        seqs = []
+        for seq in genout:
+            seqs.append({"generated_text": seq})
+        if(vars.lua_koboldbridge.restart_sequence is not None and vars.lua_koboldbridge.restart_sequence > 0):
+            genresult(seqs[vars.lua_koboldbridge.restart_sequence-1]["generated_text"])
+        else:
+            genselect(seqs)
+
+    set_aibusy(0)
+    return
 
 #==================================================================#
 # Send text to TPU mesh transformer backend