From 69d942c00cfd16708f82826fcc0d50355e322c0f Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 11 May 2023 20:22:30 -0400 Subject: [PATCH] Kind of working breakmodel --- aiserver.py | 256 +----------------- koboldai_settings.py | 3 +- modeling/inference_models/generic_hf_torch.py | 7 +- modeling/inference_models/gooseai.py | 31 +++ modeling/inference_models/hf_mtj.py | 2 +- modeling/inference_models/openai.py | 168 +----------- modeling/inference_models/parents/hf.py | 35 ++- modeling/inference_models/parents/hf_torch.py | 27 +- .../parents/openai_gooseai.py | 189 +++++++++++++ static/koboldai.js | 6 + 10 files changed, 281 insertions(+), 443 deletions(-) create mode 100644 modeling/inference_models/gooseai.py create mode 100644 modeling/inference_models/parents/openai_gooseai.py diff --git a/aiserver.py b/aiserver.py index f9e60641..158a6699 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1473,7 +1473,7 @@ def general_startup(override_args=None): koboldai_vars.quiet = True if args.nobreakmodel: - koboldai_vars.nobreakmodel = True + model_loaders['generic_hf_torch'].nobreakmodel = True if args.remote: koboldai_vars.host = True; @@ -1484,6 +1484,9 @@ def general_startup(override_args=None): if args.localtunnel: koboldai_vars.host = True; + if args.lowmem: + model_loaders['generic_hf_torch'].low_mem = True + if args.host != "Disabled": # This means --host option was submitted without an argument # Enable all LAN IPs (0.0.0.0/0) @@ -1516,6 +1519,9 @@ def general_startup(override_args=None): koboldai_vars.trust_remote_code = True if args.cpu: koboldai_vars.use_colab_tpu = False + koboldai_vars.hascuda = False + koboldai_vars.usegpu = False + model_loaders['generic_hf_torch'].nobreakmodel = True koboldai_vars.smandelete = koboldai_vars.host == args.override_delete koboldai_vars.smanrename = koboldai_vars.host == args.override_rename @@ -1545,245 +1551,6 @@ def general_startup(override_args=None): socketio.start_background_task(socket_io_relay, koboldai_settings.queue, socketio) -#==================================================================# -# Load Model -#==================================================================# - -@socketio.on("get_model_info") -def get_model_info(model, directory=""): - logger.info("Selected: {}, {}".format(model, directory)) - # if the model is in the api list - disk_blocks = 0 - key = False - breakmodel = False - gpu = False - layer_count = None - key_value = "" - break_values = [] - url = False - default_url = None - models_on_url = False - multi_online_models = False - show_online_model_select=False - gpu_count = torch.cuda.device_count() - gpu_names = [] - send_horde_models = False - show_custom_model_box = False - for i in range(gpu_count): - gpu_names.append(torch.cuda.get_device_name(i)) - if model in ['Colab', 'API']: - url = True - elif model == 'CLUSTER': - models_on_url = True - show_online_model_select=True - url = True - key = True - default_url = koboldai_vars.horde_url - multi_online_models = True - key_value = koboldai_vars.horde_api_key - url = koboldai_vars.horde_url - if key_value: - send_horde_models = True - elif model in [x.name for x in model_menu['apilist']]: - show_online_model_select=True - if path.exists("settings/{}.v2_settings".format(model)): - with open("settings/{}.v2_settings".format(model), "r") as file: - # Check if API key exists - try: - js = json.load(file) - - if("apikey" in js and js["apikey"] != ""): - # API key exists, grab it and close the file - key_value = js["apikey"] - elif 'oaiapikey' in js and js['oaiapikey'] 
!= "": - key_value = js["oaiapikey"] - if model in ('GooseAI', 'OAI'): - get_oai_models({'model': model, 'key': key_value}) - except json.decoder.JSONDecodeError: - print(":(") - pass - key = True - elif "rwkv" in model.lower(): - pass - elif model == 'ReadOnly': - pass - #elif model == 'customhuggingface': - # show_custom_model_box = True - elif args.cpu: - pass - else: - layer_count = get_layer_count(model, directory=directory) - if layer_count is None: - breakmodel = False - gpu = True - else: - breakmodel = True - if model in ["NeoCustom", "GPT2Custom", "customhuggingface"]: - filename = "settings/{}.breakmodel".format(os.path.basename(os.path.normpath(directory))) - else: - filename = "settings/{}.breakmodel".format(model.replace("/", "_")) - if path.exists(filename): - with open(filename, "r") as file: - data = [x for x in file.read().split("\n")[:2] if x != ''] - if len(data) < 2: - data.append("0") - break_values, disk_blocks = data - break_values = break_values.split(",") - else: - break_values = [layer_count] - break_values = [int(x) for x in break_values if x != ''] - break_values += [0] * (gpu_count - len(break_values)) - emit('from_server', {'cmd': 'selected_model_info', 'key_value': key_value, 'key':key, 'multi_online_models': multi_online_models, 'default_url': default_url, - 'gpu':gpu, 'layer_count':layer_count, 'breakmodel':breakmodel, - 'disk_break_value': disk_blocks, 'accelerate': True, - 'break_values': break_values, 'gpu_count': gpu_count, - 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, - 'show_custom_model_box': show_custom_model_box}, broadcast=True, room="UI_1") - emit('selected_model_info', {'key_value': key_value, 'key':key, - 'gpu':gpu, 'layer_count':layer_count, 'breakmodel':breakmodel, 'multi_online_models': multi_online_models, 'default_url': default_url, - 'disk_break_value': disk_blocks, 'disk_break': True, - 'break_values': break_values, 'gpu_count': gpu_count, - 'url': url, 'gpu_names': gpu_names, 'models_on_url': models_on_url, 'show_online_model_select': show_online_model_select, - 'bit_8_available': koboldai_vars.bit_8_available if koboldai_vars.experimental_features else False, - 'show_custom_model_box': show_custom_model_box}) - if send_horde_models: - get_cluster_models({'key': key_value, 'url': default_url}) - elif key_value != "" and model in [x.name for x in model_menu['apilist']] and model != 'CLUSTER': - get_oai_models(key_value) - - - -def get_layer_count(model, directory=""): - if(model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]): - if(model == "GPT2Custom"): - with open(os.path.join(directory, "config.json"), "r") as f: - model_config = json.load(f) - # Get the model_type from the config or assume a model type if it isn't present - else: - if(directory): - model = directory - from transformers import AutoConfig - if(os.path.isdir(model.replace('/', '_'))): - model_config = AutoConfig.from_pretrained(model.replace('/', '_'), revision=koboldai_vars.revision, cache_dir="cache") - elif(is_model_downloaded(model)): - model_config = AutoConfig.from_pretrained("models/{}".format(model.replace('/', '_')), revision=koboldai_vars.revision, cache_dir="cache") - elif(os.path.isdir(directory)): - model_config = AutoConfig.from_pretrained(directory, revision=koboldai_vars.revision, cache_dir="cache") - elif(os.path.isdir(koboldai_vars.custmodpth.replace('/', '_'))): - model_config = AutoConfig.from_pretrained(koboldai_vars.custmodpth.replace('/', '_'), 
revision=koboldai_vars.revision, cache_dir="cache") - else: - model_config = AutoConfig.from_pretrained(model, revision=koboldai_vars.revision, cache_dir="cache") - try: - if (model_config.model_type != 'gpt2' or model_config.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not koboldai_vars.nobreakmodel: - return utils.num_layers(model_config) - else: - return None - except: - return None - else: - return None - -@socketio.on('OAI_Key_Update') -def get_oai_models(data): - key = data['key'] - model = data['model'] - koboldai_vars.oaiapikey = key - if model == 'OAI': - url = "https://api.openai.com/v1/engines" - elif model == 'GooseAI': - url = "https://api.goose.ai/v1/engines" - else: - return - - # Get list of models from OAI - logger.init("OAI Engines", status="Retrieving") - req = requests.get( - url, - headers = { - 'Authorization': 'Bearer '+key - } - ) - if(req.status_code == 200): - r = req.json() - engines = r["data"] - try: - engines = [[en["id"], "{} ({})".format(en['id'], "Ready" if en["ready"] == True else "Not Ready")] for en in engines] - except: - logger.error(engines) - raise - - online_model = "" - changed=False - - #Save the key - if not path.exists("settings"): - # If the client settings file doesn't exist, create it - # Write API key to file - os.makedirs('settings', exist_ok=True) - if path.exists("settings/{}.v2_settings".format(model)): - with open("settings/{}.v2_settings".format(model), "r") as file: - js = json.load(file) - if 'online_model' in js: - online_model = js['online_model'] - if "apikey" in js: - if js['apikey'] != key: - changed=True - else: - js = {} - changed=True - - if changed: - with open("settings/{}.v2_settings".format(model), "w") as file: - js["apikey"] = key - file.write(json.dumps(js, indent=3)) - - logger.init_ok("OAI Engines", status="OK") - emit('from_server', {'cmd': 'oai_engines', 'data': engines, 'online_model': online_model}, broadcast=True, room="UI_1") - emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2") - else: - # Something went wrong, print the message and quit since we can't initialize an engine - logger.init_err("OAI Engines", status="Failed") - logger.error(req.json()) - emit('from_server', {'cmd': 'errmsg', 'data': req.json()}) - -@socketio.on("get_cluster_models") -def get_cluster_models(msg): - koboldai_vars.horde_api_key = msg['key'] or koboldai_vars.horde_api_key - url = msg['url'] or koboldai_vars.horde_url - koboldai_vars.horde_url = url - # Get list of models from public cluster - print("{0}Retrieving engine list...{1}".format(colors.PURPLE, colors.END), end="") - try: - req = requests.get(f"{url}/api/v2/status/models?type=text") - except: - logger.init_err("KAI Horde Models", status="Failed") - logger.error("Provided KoboldAI Horde URL unreachable") - emit('from_server', {'cmd': 'errmsg', 'data': "Provided KoboldAI Horde URL unreachable"}) - return - if not req.ok: - # Something went wrong, print the message and quit since we can't initialize an engine - logger.init_err("KAI Horde Models", status="Failed") - logger.error(req.json()) - emit('from_server', {'cmd': 'errmsg', 'data': req.json()}, room="UI_1") - return - - engines = req.json() - logger.debug(engines) - try: - engines = [[en["name"], en["name"]] for en in engines] - except: - logger.error(engines) - raise - logger.debug(engines) - - online_model = "" - savesettings() - - logger.init_ok("KAI Horde Models", status="OK") - - emit('from_server', {'cmd': 'oai_engines', 'data': engines, 'online_model': 
online_model}, broadcast=True, room="UI_1") - emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2") - def unload_model(): global model @@ -1845,7 +1612,6 @@ def load_model(plugin, initial_load=False): # loadmodelsettings() # loadsettings() logger.init("GPU support", status="Searching") - koboldai_vars.hascuda = torch.cuda.is_available() and not args.cpu koboldai_vars.bmsupported = ((koboldai_vars.model_type != 'gpt2') or koboldai_vars.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not koboldai_vars.nobreakmodel if(args.breakmodel is not None and args.breakmodel): logger.warning("--breakmodel is no longer supported. Breakmodel mode is now automatically enabled when --breakmodel_gpulayers is used (see --help for details).") @@ -1861,12 +1627,7 @@ def load_model(plugin, initial_load=False): else: logger.init_warn("GPU support", status="Not Found") - if args.cpu: - koboldai_vars.usegpu = False - gpu_layers = None - disk_layers = None - koboldai_vars.breakmodel = False - elif koboldai_vars.hascuda: + if koboldai_vars.hascuda: if(koboldai_vars.bmsupported): koboldai_vars.usegpu = False koboldai_vars.breakmodel = True @@ -1879,6 +1640,7 @@ def load_model(plugin, initial_load=False): model = model_loaders[plugin] model.load(initial_load=initial_load) + logger.debug("Model Type: {}".format(koboldai_vars.model_type)) # TODO: Convert everywhere to use model.tokenizer if model: diff --git a/koboldai_settings.py b/koboldai_settings.py index d8416df2..e9562ffc 100644 --- a/koboldai_settings.py +++ b/koboldai_settings.py @@ -710,7 +710,6 @@ class model_settings(settings): self.modeldim = -1 # Embedding dimension of your model (e.g. it's 4096 for GPT-J-6B and 2560 for GPT-Neo-2.7B) self.sampler_order = [6, 0, 1, 2, 3, 4, 5] self.newlinemode = "n" - self.lazy_load = True # Whether or not to use torch_lazy_loader.py for transformers models in order to reduce CPU memory usage self.presets = [] # Holder for presets self.selected_preset = "" self.uid_presets = [] @@ -1236,7 +1235,7 @@ class system_settings(settings): self.corescript = "default.lua" # Filename of corescript to load self.gpu_device = 0 # Which PyTorch device to use when using pure GPU generation self.savedir = os.getcwd()+"\\stories" - self.hascuda = False # Whether torch has detected CUDA on the system + self.hascuda = torch.cuda.is_available() # Whether torch has detected CUDA on the system self.usegpu = False # Whether to launch pipeline with GPU support self.splist = [] self.spselect = "" # Temporary storage for soft prompt filename to load diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py index d5cf6397..c228e2ee 100644 --- a/modeling/inference_models/generic_hf_torch.py +++ b/modeling/inference_models/generic_hf_torch.py @@ -30,7 +30,6 @@ class model_loader(HFTorchInferenceModel): def _load(self, save_model: bool, initial_load: bool) -> None: utils.koboldai_vars.allowsp = True - self.lazy_load = utils.koboldai_vars.lazy_load # Make model path the same as the model name to make this consistent # with the other loading method if it isn't a known model type. 
This @@ -69,12 +68,14 @@ class model_loader(HFTorchInferenceModel): # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors + logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel)) if ( self.lazy_load and utils.koboldai_vars.hascuda - and utils.koboldai_vars.breakmodel - and not utils.koboldai_vars.nobreakmodel + and self.breakmodel + and not self.nobreakmodel ): + logger.debug("loading breakmodel") self.breakmodel_device_config(self.model_config) if self.lazy_load: diff --git a/modeling/inference_models/gooseai.py b/modeling/inference_models/gooseai.py new file mode 100644 index 00000000..08d8ea06 --- /dev/null +++ b/modeling/inference_models/gooseai.py @@ -0,0 +1,31 @@ +import torch +import requests +import numpy as np +from typing import List, Optional, Union + +import utils +from logger import logger +from modeling.inference_model import ( + GenerationResult, + GenerationSettings, + InferenceModel, +) + +from modeling.inference_models.parents.openai_gooseai import model_loader as openai_gooseai_model_loader + + + +class OpenAIAPIError(Exception): + def __init__(self, error_type: str, error_message) -> None: + super().__init__(f"{error_type}: {error_message}") + + +class model_loader(openai_gooseai_model_loader): + """InferenceModel for interfacing with OpenAI's generation API.""" + + def __init__(self): + super().__init__() + self.url = "https://api.goose.ai/v1/engines" + + def is_valid(self, model_name, model_path, menu_path): + return model_name == "GooseAI" \ No newline at end of file diff --git a/modeling/inference_models/hf_mtj.py b/modeling/inference_models/hf_mtj.py index c99e9a05..759feb65 100644 --- a/modeling/inference_models/hf_mtj.py +++ b/modeling/inference_models/hf_mtj.py @@ -27,7 +27,7 @@ class model_loader(HFInferenceModel): #model_name: str, ) -> None: super().__init__() - + self.hf_torch = False self.model_config = None self.capabilties = ModelCapabilities( embedding_manipulation=False, diff --git a/modeling/inference_models/openai.py b/modeling/inference_models/openai.py index efbb01d3..cad2a7f2 100644 --- a/modeling/inference_models/openai.py +++ b/modeling/inference_models/openai.py @@ -11,6 +11,8 @@ from modeling.inference_model import ( InferenceModel, ) +from modeling.inference_models.parents.openai_gooseai import model_loader as openai_gooseai_model_loader + class OpenAIAPIError(Exception): @@ -18,172 +20,12 @@ class OpenAIAPIError(Exception): super().__init__(f"{error_type}: {error_message}") -class model_loader(InferenceModel): +class model_loader(openai_gooseai_model_loader): """InferenceModel for interfacing with OpenAI's generation API.""" def __init__(self): super().__init__() - self.key = "" + self.url = "https://api.openai.com/v1/engines" def is_valid(self, model_name, model_path, menu_path): - return model_name == "OAI" or model_name == "GooseAI" - - def get_requested_parameters(self, model_name, model_path, menu_path): - self.source = model_name - requested_parameters = [] - requested_parameters.extend([{ - "uitype": "text", - "unit": "text", - "label": "Key", - "id": "key", - "default": "", - "check": {"value": "", 'check': "!="}, - "tooltip": "User Key to use when connecting to OpenAI/GooseAI.", - "menu_path": "", - "refresh_model_inputs": True, - "extra_classes": "" - }, - { - "uitype": "dropdown", - "unit": "text", - "label": "Model", - "id": "model", - 
"default": "", - "check": {"value": "", 'check': "!="}, - "tooltip": "Which model to use when running OpenAI/GooseAI.", - "menu_path": "", - "refresh_model_inputs": False, - "extra_classes": "", - 'children': self.get_oai_models(), - - }]) - return requested_parameters - - def set_input_parameters(self, parameters): - self.key = parameters['key'].strip() - self.model = parameters['model'] - - def get_oai_models(self): - if self.key == "": - return [] - if self.source == 'OAI': - url = "https://api.openai.com/v1/engines" - elif self.source == 'GooseAI': - url = "https://api.goose.ai/v1/engines" - else: - return - - # Get list of models from OAI - logger.init("OAI Engines", status="Retrieving") - req = requests.get( - url, - headers = { - 'Authorization': 'Bearer '+self.key - } - ) - if(req.status_code == 200): - r = req.json() - engines = r["data"] - try: - engines = [{"value": en["id"], "text": "{} ({})".format(en['id'], "Ready" if en["ready"] == True else "Not Ready")} for en in engines] - except: - logger.error(engines) - raise - - online_model = "" - - - logger.init_ok("OAI Engines", status="OK") - return engines - else: - # Something went wrong, print the message and quit since we can't initialize an engine - logger.init_err("OAI Engines", status="Failed") - logger.error(req.json()) - emit('from_server', {'cmd': 'errmsg', 'data': req.json()}) - return [] - - - def _load(self, save_model: bool, initial_load: bool) -> None: - self.tokenizer = self._get_tokenizer("gpt2") - - def _raw_generate( - self, - prompt_tokens: Union[List[int], torch.Tensor], - max_new: int, - gen_settings: GenerationSettings, - single_line: bool = False, - batch_count: int = 1, - seed: Optional[int] = None, - **kwargs, - ) -> GenerationResult: - - if seed is not None: - logger.warning( - "Seed is unsupported on the OpenAIAPIInferenceModel. Seed will be ignored." - ) - - decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens)) - - # Store context in memory to use it for comparison with generated content - utils.koboldai_vars.lastctx = decoded_prompt - - # Build request JSON data - # GooseAI is a subntype of OAI. 
So to check if it's this type, we check the configname as a workaround - # as the koboldai_vars.model will always be OAI - if "GooseAI" in utils.koboldai_vars.configname: - reqdata = { - "prompt": decoded_prompt, - "max_tokens": max_new, - "temperature": gen_settings.temp, - "top_a": gen_settings.top_a, - "top_p": gen_settings.top_p, - "top_k": gen_settings.top_k, - "tfs": gen_settings.tfs, - "typical_p": gen_settings.typical, - "repetition_penalty": gen_settings.rep_pen, - "repetition_penalty_slope": gen_settings.rep_pen_slope, - "repetition_penalty_range": gen_settings.rep_pen_range, - "n": batch_count, - # TODO: Implement streaming - "stream": False, - } - else: - reqdata = { - "prompt": decoded_prompt, - "max_tokens": max_new, - "temperature": gen_settings.temp, - "top_p": gen_settings.top_p, - "frequency_penalty": gen_settings.rep_pen, - "n": batch_count, - "stream": False, - } - - req = requests.post( - utils.koboldai_vars.oaiurl, - json=reqdata, - headers={ - "Authorization": "Bearer " + utils.koboldai_vars.oaiapikey, - "Content-Type": "application/json", - }, - ) - - j = req.json() - - if not req.ok: - # Send error message to web client - if "error" in j: - error_type = j["error"]["type"] - error_message = j["error"]["message"] - else: - error_type = "Unknown" - error_message = "Unknown" - raise OpenAIAPIError(error_type, error_message) - - outputs = [out["text"] for out in j["choices"]] - return GenerationResult( - model=self, - out_batches=np.array([self.tokenizer.encode(x) for x in outputs]), - prompt=prompt_tokens, - is_whole_generation=True, - single_line=single_line, - ) + return model_name == "OAI" \ No newline at end of file diff --git a/modeling/inference_models/parents/hf.py b/modeling/inference_models/parents/hf.py index 1941a12e..c7a781d7 100644 --- a/modeling/inference_models/parents/hf.py +++ b/modeling/inference_models/parents/hf.py @@ -22,18 +22,19 @@ class HFInferenceModel(InferenceModel): def is_valid(self, model_name, model_path, menu_path): try: if model_path is not None and os.path.exists(model_path): - model_config = AutoConfig.from_pretrained(model_path) + self.model_config = AutoConfig.from_pretrained(model_path) elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))): - model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache") + self.model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache") else: - model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache") + self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache") return True except: return False def get_requested_parameters(self, model_name, model_path, menu_path): requested_parameters = [] - + if not self.hf_torch: + return [] if model_path is not None and os.path.exists(model_path): self.model_config = AutoConfig.from_pretrained(model_path) elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))): @@ -124,14 +125,20 @@ class HFInferenceModel(InferenceModel): return requested_parameters def set_input_parameters(self, parameters): - gpu_count = torch.cuda.device_count() - layers = [] - for i in range(gpu_count): - layers.append(int(parameters["{}_Layers".format(i)]) if parameters["{}_Layers".format(i)].isnumeric() else None) - self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' 
in parameters else None - self.layers = layers - self.disk_layers = parameters['disk_layers'] if 'disk_layers' in parameters else None - self.use_gpu = parameters['use_gpu'] if 'use_gpu' in parameters else None + if self.hf_torch: + import breakmodel + gpu_count = torch.cuda.device_count() + layers = [] + for i in range(gpu_count): + layers.append(int(parameters["{}_Layers".format(i)]) if parameters["{}_Layers".format(i)].isnumeric() else None) + self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None + self.layers = layers + self.disk_layers = int(parameters['disk_layers']) if 'disk_layers' in parameters and parameters['disk_layers'].isnumeric() else 0 + breakmodel.gpu_blocks = layers + breakmodel.disk_blocks = self.disk_layers + self.use_gpu = parameters['use_gpu'] if 'use_gpu' in parameters else None + self.model_type = self.get_model_type() + self.breakmodel = ((self.model_type != 'gpt2') or self.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not self.nobreakmodel self.model_name = parameters['id'] self.path = parameters['path'] if 'path' in parameters else None @@ -157,6 +164,10 @@ class HFInferenceModel(InferenceModel): torch.cuda.empty_cache() except: pass + if self.hf_torch: + breakmodel.breakmodel = True + breakmodel.gpu_blocks = [] + breakmodel.disk_blocks = 0 def _post_load(self) -> None: # These are model specific tokenizer overrides if a model has bad defaults diff --git a/modeling/inference_models/parents/hf_torch.py b/modeling/inference_models/parents/hf_torch.py index 7cc16ad5..84c60a6c 100644 --- a/modeling/inference_models/parents/hf_torch.py +++ b/modeling/inference_models/parents/hf_torch.py @@ -53,15 +53,12 @@ LOG_SAMPLER_NO_EFFECT = False class HFTorchInferenceModel(HFInferenceModel): - def __init__( - self, - #model_name: str, - #lazy_load: bool, - #low_mem: bool, - ) -> None: + def __init__(self) -> None: super().__init__() - #self.lazy_load = lazy_load - #self.low_mem = low_mem + self.hf_torch = True + self.lazy_load = True + self.low_mem = False + self.nobreakmodel = False self.post_token_hooks = [ PostTokenHooks.stream_tokens, @@ -398,7 +395,7 @@ class HFTorchInferenceModel(HFInferenceModel): Embedding._koboldai_patch_causallm_model = self.model def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True): - if not utils.koboldai_vars.lazy_load: + if not self.lazy_load: return if utils.args.breakmodel_disklayers is not None: @@ -819,14 +816,14 @@ class HFTorchInferenceModel(HFInferenceModel): elif ( utils.args.breakmodel_gpulayers is not None or utils.args.breakmodel_disklayers is not None + or breakmodel.gpu_blocks != [] ): try: - if not utils.args.breakmodel_gpulayers: - breakmodel.gpu_blocks = [] - else: - breakmodel.gpu_blocks = list( - map(int, utils.args.breakmodel_gpulayers.split(",")) - ) + if breakmodel.gpu_blocks == []: + if utils.args.breakmodel_gpulayers: + breakmodel.gpu_blocks = list( + map(int, utils.args.breakmodel_gpulayers.split(",")) + ) assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count() s = n_layers for i in range(len(breakmodel.gpu_blocks)): diff --git a/modeling/inference_models/parents/openai_gooseai.py b/modeling/inference_models/parents/openai_gooseai.py new file mode 100644 index 00000000..621ccbad --- /dev/null +++ b/modeling/inference_models/parents/openai_gooseai.py @@ -0,0 +1,189 @@ +import torch +import requests +import numpy as np +from typing import List, Optional, Union + +import utils +from logger import logger +from modeling.inference_model import ( + 
GenerationResult, + GenerationSettings, + InferenceModel, +) + + + +class OpenAIAPIError(Exception): + def __init__(self, error_type: str, error_message) -> None: + super().__init__(f"{error_type}: {error_message}") + + +class model_loader(InferenceModel): + """InferenceModel for interfacing with OpenAI's generation API.""" + + def __init__(self): + super().__init__() + self.key = "" + self.url = "https://api.goose.ai/v1/engines" + #if self.source == 'OAI': + # url = "https://api.openai.com/v1/engines" + #elif self.source == 'GooseAI': + # url = "https://api.goose.ai/v1/engines" + + def is_valid(self, model_name, model_path, menu_path): + return model_name == "OAI" or model_name == "GooseAI" + + def get_requested_parameters(self, model_name, model_path, menu_path): + self.source = model_name + requested_parameters = [] + requested_parameters.extend([{ + "uitype": "text", + "unit": "text", + "label": "Key", + "id": "key", + "default": "", + "check": {"value": "", 'check': "!="}, + "tooltip": "User Key to use when connecting to OpenAI/GooseAI.", + "menu_path": "", + "refresh_model_inputs": True, + "extra_classes": "" + }, + { + "uitype": "dropdown", + "unit": "text", + "label": "Model", + "id": "model", + "default": "", + "check": {"value": "", 'check': "!="}, + "tooltip": "Which model to use when running OpenAI/GooseAI.", + "menu_path": "", + "refresh_model_inputs": False, + "extra_classes": "", + 'children': self.get_oai_models(), + + }]) + return requested_parameters + + def set_input_parameters(self, parameters): + self.key = parameters['key'].strip() + self.model = parameters['model'] + + def get_oai_models(self): + if self.key == "": + return [] + + + # Get list of models from OAI + logger.init("OAI Engines", status="Retrieving") + req = requests.get( + self.url, + headers = { + 'Authorization': 'Bearer '+self.key + } + ) + if(req.status_code == 200): + r = req.json() + engines = r["data"] + try: + engines = [{"value": en["id"], "text": "{} ({})".format(en['id'], "Ready" if en["ready"] == True else "Not Ready")} for en in engines] + except: + logger.error(engines) + raise + + online_model = "" + + + logger.init_ok("OAI Engines", status="OK") + return engines + else: + # Something went wrong, print the message and quit since we can't initialize an engine + logger.init_err("OAI Engines", status="Failed") + logger.error(req.json()) + emit('from_server', {'cmd': 'errmsg', 'data': req.json()}) + return [] + + + def _load(self, save_model: bool, initial_load: bool) -> None: + self.tokenizer = self._get_tokenizer("gpt2") + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + seed: Optional[int] = None, + **kwargs, + ) -> GenerationResult: + + if seed is not None: + logger.warning( + "Seed is unsupported on the OpenAIAPIInferenceModel. Seed will be ignored." + ) + + decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens)) + + # Store context in memory to use it for comparison with generated content + utils.koboldai_vars.lastctx = decoded_prompt + + # Build request JSON data + # GooseAI is a subntype of OAI. 
So to check if it's this type, we check the configname as a workaround + # as the koboldai_vars.model will always be OAI + if "GooseAI" in utils.koboldai_vars.configname: + reqdata = { + "prompt": decoded_prompt, + "max_tokens": max_new, + "temperature": gen_settings.temp, + "top_a": gen_settings.top_a, + "top_p": gen_settings.top_p, + "top_k": gen_settings.top_k, + "tfs": gen_settings.tfs, + "typical_p": gen_settings.typical, + "repetition_penalty": gen_settings.rep_pen, + "repetition_penalty_slope": gen_settings.rep_pen_slope, + "repetition_penalty_range": gen_settings.rep_pen_range, + "n": batch_count, + # TODO: Implement streaming + "stream": False, + } + else: + reqdata = { + "prompt": decoded_prompt, + "max_tokens": max_new, + "temperature": gen_settings.temp, + "top_p": gen_settings.top_p, + "frequency_penalty": gen_settings.rep_pen, + "n": batch_count, + "stream": False, + } + + req = requests.post( + self.url, + json=reqdata, + headers={ + "Authorization": "Bearer " + self.key, + "Content-Type": "application/json", + }, + ) + + j = req.json() + + if not req.ok: + # Send error message to web client + if "error" in j: + error_type = j["error"]["type"] + error_message = j["error"]["message"] + else: + error_type = "Unknown" + error_message = "Unknown" + raise OpenAIAPIError(error_type, error_message) + + outputs = [out["text"] for out in j["choices"]] + return GenerationResult( + model=self, + out_batches=np.array([self.tokenizer.encode(x) for x in outputs]), + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) diff --git a/static/koboldai.js b/static/koboldai.js index 7f004ff2..ab7f7832 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -1652,6 +1652,12 @@ function selected_model_info(data) { while (loadmodelsettings.firstChild) { loadmodelsettings.removeChild(loadmodelsettings.firstChild); } + //Clear out plugin selector + var model_plugin = document.getElementById('modelplugin'); + while (model_plugin.firstChild) { + model_plugin.removeChild(model_plugin.firstChild); + } + var accept = document.getElementById("btn_loadmodelaccept"); accept.disabled = false;
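
The core of this patch is pulling the OpenAI and GooseAI handling out of aiserver.py (get_model_info / get_oai_models) and into a shared parent loader in modeling/inference_models/parents/openai_gooseai.py; the concrete loaders in openai.py and gooseai.py then differ only in the engines URL and their is_valid() check, and load_model() simply dispatches to whichever entry in model_loaders matched. The sketch below condenses that pattern into a self-contained example; the class names OpenAIGooseAILoader, OpenAILoader and GooseAILoader are illustrative (each loader in the diff is literally named model_loader inside its own module), and the real classes derive from InferenceModel and report failures through the project's logger/emit helpers rather than returning silently.

import requests


class OpenAIGooseAILoader:
    """Shared behaviour for the two OpenAI-compatible "engines" endpoints."""

    def __init__(self):
        self.key = ""
        self.url = "https://api.goose.ai/v1/engines"   # children override this

    def is_valid(self, model_name, model_path, menu_path):
        return model_name in ("OAI", "GooseAI")

    def get_oai_models(self):
        # No key yet -> nothing to list (same early-out as the patched parent).
        if not self.key:
            return []
        req = requests.get(self.url, headers={"Authorization": "Bearer " + self.key})
        if req.status_code != 200:
            return []
        return [
            {"value": en["id"],
             "text": "{} ({})".format(en["id"], "Ready" if en.get("ready") else "Not Ready")}
            for en in req.json()["data"]
        ]


class OpenAILoader(OpenAIGooseAILoader):
    def __init__(self):
        super().__init__()
        self.url = "https://api.openai.com/v1/engines"

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "OAI"


class GooseAILoader(OpenAIGooseAILoader):
    def __init__(self):
        super().__init__()
        self.url = "https://api.goose.ai/v1/engines"

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "GooseAI"

Keeping is_valid() on each child is what lets the loader registry pick a backend from the menu name alone, which is why the old GooseAI branch inside openai.py could be deleted.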
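On the local-model side, HFInferenceModel.set_input_parameters (parents/hf.py) now writes the per-GPU layer split straight into breakmodel.gpu_blocks / breakmodel.disk_blocks and derives breakmodel eligibility from the model type itself instead of koboldai_vars. The snippet below is a rough, self-contained sketch of that parameter handling; apply_layer_parameters and the SimpleNamespace stand-in for the breakmodel module are hypothetical, and the eligibility expression is copied as written in the diff.

from types import SimpleNamespace

# Stand-in for the real breakmodel module; only the two fields the patch assigns.
breakmodel = SimpleNamespace(gpu_blocks=[], disk_blocks=0)

BREAKMODEL_TYPES = ("gpt_neo", "gptj", "xglm", "opt")


def apply_layer_parameters(parameters, gpu_count, model_type, nobreakmodel=False):
    # Per-GPU counts arrive as strings keyed "<n>_Layers"; non-numeric means unset.
    layers = []
    for i in range(gpu_count):
        raw = parameters.get("{}_Layers".format(i), "")
        layers.append(int(raw) if str(raw).isnumeric() else None)

    raw_disk = parameters.get("disk_layers", "")
    disk_layers = int(raw_disk) if str(raw_disk).isnumeric() else 0

    breakmodel.gpu_blocks = layers
    breakmodel.disk_blocks = disk_layers

    # Eligibility rule as written in the diff.
    use_breakmodel = ((model_type != "gpt2")
                      or model_type in BREAKMODEL_TYPES) and not nobreakmodel
    return layers, disk_layers, use_breakmodel


# Example: 28 layers on GPU 0, 4 on GPU 1, 2 blocks on disk for a gpt_neo model.
print(apply_layer_parameters({"0_Layers": "28", "1_Layers": "4", "disk_layers": "2"},
                             gpu_count=2, model_type="gpt_neo"))

generic_hf_torch.py then only calls breakmodel_device_config() when self.lazy_load, CUDA, self.breakmodel and not self.nobreakmodel all line up, which is exactly the condition the new debug line prints.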
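The hf_torch.py hunk also changes which layer split wins: if the UI already populated breakmodel.gpu_blocks, the --breakmodel_gpulayers command-line value is ignored, and the flag is only parsed as a fallback when the list is still empty. A minimal sketch of that precedence follows, assuming a hypothetical resolve_gpu_blocks helper and a bare layer count; the real logic lives in _get_lazy_load_callback / breakmodel_device_config and additionally accounts for disk layers and the primary device.

def resolve_gpu_blocks(ui_blocks, cli_gpulayers, n_layers, device_count):
    """UI-supplied blocks take precedence; the CLI flag is only a fallback."""
    gpu_blocks = list(ui_blocks)
    if gpu_blocks == [] and cli_gpulayers:
        gpu_blocks = list(map(int, cli_gpulayers.split(",")))
    assert len(gpu_blocks) <= device_count

    # Walk the list and never hand out more layers than remain, in the same
    # spirit as the accounting loop in the real device-config code.
    remaining = n_layers
    for i, blocks in enumerate(gpu_blocks):
        if blocks <= -1:                  # -1 conventionally means "all that is left"
            gpu_blocks[i] = remaining
            remaining = 0
        else:
            gpu_blocks[i] = min(blocks, remaining)
            remaining -= gpu_blocks[i]
    return gpu_blocks, remaining          # whatever is left stays on CPU/disk


print(resolve_gpu_blocks([],      "20,8", n_layers=32, device_count=2))  # CLI fallback
print(resolve_gpu_blocks([28, 4], "20,8", n_layers=32, device_count=2))  # UI wins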
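Finally, the request building that moved into the shared parent keeps the long-standing split between the two services: GooseAI's completions endpoint accepts KoboldAI's full sampler set, while plain OpenAI only receives temperature, top_p and a frequency penalty, and both now post to self.url with self.key instead of the old koboldai_vars fields. The helper below is a trimmed, hypothetical build_completion_payload with plain values standing in for the gen_settings object; the real _raw_generate picks the branch by checking whether "GooseAI" appears in utils.koboldai_vars.configname.

def build_completion_payload(prompt, max_new, s, batch_count, gooseai=False):
    """Payload shapes used by the patched _raw_generate (streaming is still a TODO)."""
    if gooseai:
        return {
            "prompt": prompt,
            "max_tokens": max_new,
            "temperature": s["temp"],
            "top_a": s["top_a"],
            "top_p": s["top_p"],
            "top_k": s["top_k"],
            "tfs": s["tfs"],
            "typical_p": s["typical"],
            "repetition_penalty": s["rep_pen"],
            "repetition_penalty_slope": s["rep_pen_slope"],
            "repetition_penalty_range": s["rep_pen_range"],
            "n": batch_count,
            "stream": False,
        }
    return {
        "prompt": prompt,
        "max_tokens": max_new,
        "temperature": s["temp"],
        "top_p": s["top_p"],
        "frequency_penalty": s["rep_pen"],
        "n": batch_count,
        "stream": False,
    }


settings = {"temp": 0.7, "top_a": 0.0, "top_p": 0.9, "top_k": 0, "tfs": 1.0,
            "typical": 1.0, "rep_pen": 1.1, "rep_pen_slope": 0.7, "rep_pen_range": 1024}
print(build_completion_payload("Once upon a time", 80, settings, batch_count=1, gooseai=True))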