From 4605d10c370b994cfbd1d27891ccae6ade8b9c6b Mon Sep 17 00:00:00 2001 From: ebolam Date: Thu, 11 May 2023 12:08:35 -0400 Subject: [PATCH] Next iteration. Model Loading is broken completely now :) --- aiserver.py | 180 +++--------------- modeling/inference_model.py | 6 +- modeling/inference_models/api.py | 4 +- modeling/inference_models/basic_api.py | 4 +- modeling/inference_models/generic_hf_torch.py | 1 + modeling/inference_models/horde.py | 8 +- modeling/inference_models/openai.py | 6 +- modeling/inference_models/parents/hf.py | 56 ++++-- modeling/inference_models/parents/hf_torch.py | 2 +- modeling/inference_models/readonly.py | 77 ++++++++ static/koboldai.js | 13 +- 11 files changed, 170 insertions(+), 187 deletions(-) create mode 100644 modeling/inference_models/readonly.py diff --git a/aiserver.py b/aiserver.py index ac90d6f4..f9e60641 100644 --- a/aiserver.py +++ b/aiserver.py @@ -645,10 +645,14 @@ def new_socketio_on(*a, **k): socketio.on = new_socketio_on def emit(*args, **kwargs): - try: - return _emit(*args, **kwargs) - except AttributeError: - return socketio.emit(*args, **kwargs) + if has_request_context(): + try: + return _emit(*args, **kwargs) + except AttributeError: + return socketio.emit(*args, **kwargs) + else: #We're trying to send data outside of the http context. This won't work. Try the relay + if koboldai_settings.queue is not None: + koboldai_settings.queue.put([args[0], args[1], kwargs]) utils.emit = emit #replacement for tpool.execute to maintain request contexts @@ -1780,10 +1784,6 @@ def get_cluster_models(msg): emit('from_server', {'cmd': 'oai_engines', 'data': engines, 'online_model': online_model}, broadcast=True, room="UI_1") emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2") - -def reset_model_settings(): - koboldai_vars.reset_for_model_load() - def unload_model(): global model @@ -1816,7 +1816,7 @@ def unload_model(): koboldai_vars.badwordsids = koboldai_settings.badwordsids_default -def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False): +def load_model(plugin, initial_load=False): global model global tokenizer global model_config @@ -1827,79 +1827,18 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal if initial_load: use_breakmodel_args = True - reset_model_settings() koboldai_vars.reset_model() - koboldai_vars.cluster_requested_models = [online_model] if isinstance(online_model, str) else online_model - if koboldai_vars.cluster_requested_models == [""]: - koboldai_vars.cluster_requested_models = [] - koboldai_vars.noai = False - if not use_breakmodel_args: - set_aibusy(True) - if koboldai_vars.model != 'ReadOnly': - emit('from_server', {'cmd': 'model_load_status', 'data': "Loading {}".format(koboldai_vars.model)}, broadcast=True) - #Have to add a sleep so the server will send the emit for some reason - time.sleep(0.1) + set_aibusy(True) + if koboldai_vars.model != 'ReadOnly': + emit('from_server', {'cmd': 'model_load_status', 'data': "Loading {}".format(koboldai_vars.model)}, broadcast=True) + #Have to add a sleep so the server will send the emit for some reason + time.sleep(0.1) - if gpu_layers is not None: - args.breakmodel_gpulayers = gpu_layers - elif use_breakmodel_args: - gpu_layers = args.breakmodel_gpulayers - if breakmodel_args_default_to_cpu and gpu_layers is None: - gpu_layers = args.breakmodel_gpulayers = [] - if disk_layers 
is not None: - args.breakmodel_disklayers = int(disk_layers) - elif use_breakmodel_args: - disk_layers = args.breakmodel_disklayers - if breakmodel_args_default_to_cpu and disk_layers is None: - disk_layers = args.breakmodel_disklayers = 0 + if 'model' in globals(): + model.unload() - unload_model() - - if online_model == "": - koboldai_vars.configname = getmodelname() - #Let's set the GooseAI or OpenAI server URLs if that's applicable - else: - koboldai_vars.online_model = online_model - # Swap OAI Server if GooseAI was selected - if koboldai_vars.model == "GooseAI": - koboldai_vars.oaiengines = "https://api.goose.ai/v1/engines" - koboldai_vars.model = "OAI" - koboldai_vars.configname = f"GooseAI_{online_model.replace('/', '_')}" - elif koboldai_vars.model == "CLUSTER" and isinstance(online_model, list): - if len(online_model) != 1: - koboldai_vars.configname = koboldai_vars.model - else: - koboldai_vars.configname = f"{koboldai_vars.model}_{online_model[0].replace('/', '_')}" - else: - koboldai_vars.configname = f"{koboldai_vars.model}_{online_model.replace('/', '_')}" - - if path.exists(get_config_filename()): - changed=False - with open(get_config_filename(), "r") as file: - # Check if API key exists - js = json.load(file) - if 'online_model' in js: - if js['online_model'] != online_model: - changed=True - js['online_model'] = online_model - else: - changed=True - js['online_model'] = online_model - - if changed: - with open("settings/{}.v2_settings".format(koboldai_vars.model), "w") as file: - file.write(json.dumps(js, indent=3)) - - # Swap OAI Server if GooseAI was selected - if koboldai_vars.model == "GooseAI": - koboldai_vars.oaiengines = "https://api.goose.ai/v1/engines" - koboldai_vars.model = "OAI" - args.configname = "GooseAI" + "/" + online_model - elif koboldai_vars.model != "CLUSTER": - args.configname = koboldai_vars.model + "/" + online_model - koboldai_vars.oaiurl = koboldai_vars.oaiengines + "/{0}/completions".format(online_model) # If transformers model was selected & GPU available, ask to use CPU or GPU if(not koboldai_vars.use_colab_tpu and koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]): @@ -1937,84 +1876,9 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal else: koboldai_vars.default_preset = koboldai_settings.default_preset - - # Ask for API key if InferKit was selected - if koboldai_vars.model == "InferKit": - koboldai_vars.apikey = koboldai_vars.oaiapikey - # Swap OAI Server if GooseAI was selected - if koboldai_vars.model == "GooseAI": - koboldai_vars.oaiengines = "https://api.goose.ai/v1/engines" - koboldai_vars.model = "OAI" - koboldai_vars.configname = "GooseAI" - - # Ask for API key if OpenAI was selected - if koboldai_vars.model == "OAI" and not koboldai_vars.configname: - koboldai_vars.configname = "OAI" - - if koboldai_vars.model == "ReadOnly": - koboldai_vars.noai = True - - # TODO: InferKit - if koboldai_vars.model == "ReadOnly" or koboldai_vars.noai: - pass - elif koboldai_vars.model in ["Colab", "API", "CLUSTER", "OAI"]: - koboldai_vars.colaburl = url or koboldai_vars.colaburl - koboldai_vars.usegpu = False - koboldai_vars.breakmodel = False - - if koboldai_vars.model == "Colab": - from modeling.inference_models.basic_api import model_loader - model = model_loader() - elif koboldai_vars.model == "API": - from modeling.inference_models.api import model_loader - model = 
model_loader(koboldai_vars.colaburl.replace("/request", "")) - elif koboldai_vars.model == "CLUSTER": - from modeling.inference_models.horde import model_loader - model = model_loader() - elif koboldai_vars.model == "OAI": - from modeling.inference_models.openai import model_loader - model = model_loader() - - model.load(initial_load=initial_load) - # TODO: This check sucks, make a model object or somethign - elif "rwkv" in koboldai_vars.model: - if koboldai_vars.use_colab_tpu: - raise RuntimeError("RWKV is not supported on the TPU.") - from modeling.inference_models.rwkv import model_loader - model = model_loader(koboldai_vars.model) - model.load() - elif not koboldai_vars.use_colab_tpu and not koboldai_vars.noai: - # HF Torch - logger.init("Transformers", status='Starting') - for m in ("GPTJModel", "XGLMModel"): - try: - globals()[m] = getattr(__import__("transformers"), m) - except: - pass - - from modeling.inference_models.generic_hf_torch import model_loader - model = model_loader( - koboldai_vars.model, - lazy_load=koboldai_vars.lazy_load, - low_mem=args.lowmem - ) - - model.load( - save_model=not (args.colab or args.cacheonly) or args.savemodel, - initial_load=initial_load, - ) - logger.info(f"Pipeline created: {koboldai_vars.model}") - else: - # TPU - from modeling.inference_models.hf_mtj import model_loader - model = model_loader( - koboldai_vars.model - ) - model.load( - save_model=not (args.colab or args.cacheonly) or args.savemodel, - initial_load=initial_load, - ) + model = model_loaders[plugin] + model.load(initial_load=initial_load) # TODO: Convert everywhere to use model.tokenizer if model: @@ -6532,7 +6396,8 @@ def UI_2_select_model(data): def UI_2_load_model(data): logger.info("loading Model") logger.info(data) - model_loaders[data['plugin']].set_input_parameters(**data) + model_loaders[data['plugin']].set_input_parameters(data) + load_model(data['plugin']) #load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit']) #==================================================================# @@ -8155,7 +8020,8 @@ def send_one_time_messages(data, wait_time=0): # Test #==================================================================# def model_info(): - if model_config is not None: + global model_config + if 'model_config' in globals() and model_config is not None: if isinstance(model_config, dict): if 'model_type' in model_config: model_type = str(model_config['model_type']) @@ -11045,7 +10911,7 @@ for schema in config_endpoint_schemas: def startup(): if koboldai_vars.model == "" or koboldai_vars.model is None: koboldai_vars.model = "ReadOnly" - socketio.start_background_task(load_model, **{'initial_load':True}) + socketio.start_background_task(load_model, *('readonly',), **{'initial_load':True}) print("", end="", flush=True) diff --git a/modeling/inference_model.py b/modeling/inference_model.py index 27ad46db..343eb39a 100644 --- a/modeling/inference_model.py +++ b/modeling/inference_model.py @@ -169,6 +169,7 @@ class InferenceModel: ] self.tokenizer = None self.capabilties = ModelCapabilities() + self.model_name = "Not Defined" def is_valid(self, model_name, model_path, menu_path, vram): return True @@ -176,7 +177,7 @@ class InferenceModel: def requested_parameters(self, model_name, model_path, menu_path, vram): return {} - def define_input_parameters(self): + def set_input_parameters(self, parameters): return def load(self, save_model: bool = 
False, initial_load: bool = False) -> None: @@ -186,6 +187,9 @@ class InferenceModel: self._load(save_model=save_model, initial_load=initial_load) self._post_load() + def unload(self): + return + def _pre_load(self) -> None: """Pre load hook. Called before `_load()`.""" diff --git a/modeling/inference_models/api.py b/modeling/inference_models/api.py index 41088bc7..5bddd714 100644 --- a/modeling/inference_models/api.py +++ b/modeling/inference_models/api.py @@ -46,8 +46,8 @@ class model_loader(InferenceModel): }) return requested_parameters - def set_input_parameters(self, base_url=""): - self.base_url = base_url.rstrip("/") + def set_input_parameters(self, parameters): + self.base_url = parameters['base_url'].rstrip("/") def _load(self, save_model: bool, initial_load: bool) -> None: tokenizer_id = requests.get(f"{self.base_url}/api/v1/model").json()["result"] diff --git a/modeling/inference_models/basic_api.py b/modeling/inference_models/basic_api.py index d7fc0863..5666ba8e 100644 --- a/modeling/inference_models/basic_api.py +++ b/modeling/inference_models/basic_api.py @@ -45,8 +45,8 @@ class model_loader(InferenceModel): }) return requested_parameters - def set_input_parameters(self, colaburl=""): - self.colaburl = colaburl + def set_input_parameters(self, parameters): + self.colaburl = parameters['colaburl'] def _initialize_model(self): return diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py index 366fbbb7..b542c712 100644 --- a/modeling/inference_models/generic_hf_torch.py +++ b/modeling/inference_models/generic_hf_torch.py @@ -30,6 +30,7 @@ class model_loader(HFTorchInferenceModel): def _load(self, save_model: bool, initial_load: bool) -> None: utils.koboldai_vars.allowsp = True + self.lazy_load = utils.koboldai_vars.lazy_load # Make model path the same as the model name to make this consistent # with the other loading method if it isn't a known model type. 
This diff --git a/modeling/inference_models/horde.py b/modeling/inference_models/horde.py index f02cf265..057669d7 100644 --- a/modeling/inference_models/horde.py +++ b/modeling/inference_models/horde.py @@ -78,10 +78,10 @@ class model_loader(InferenceModel): }]) return requested_parameters - def set_input_parameters(self, url="", key="", model=""): - self.key = key.strip() - self.model = model - self.url = url + def set_input_parameters(self, parameters): + self.key = parameters['key'].strip() + self.model = parameters['model'] + self.url = parameters['url'] def get_cluster_models(self): # Get list of models from public cluster diff --git a/modeling/inference_models/openai.py b/modeling/inference_models/openai.py index 01c0c037..efbb01d3 100644 --- a/modeling/inference_models/openai.py +++ b/modeling/inference_models/openai.py @@ -59,9 +59,9 @@ class model_loader(InferenceModel): }]) return requested_parameters - def set_input_parameters(self, key="", model=""): - self.key = key.strip() - self.model = model + def set_input_parameters(self, parameters): + self.key = parameters['key'].strip() + self.model = parameters['model'] def get_oai_models(self): if self.key == "": diff --git a/modeling/inference_models/parents/hf.py b/modeling/inference_models/parents/hf.py index 54781296..3099feaf 100644 --- a/modeling/inference_models/parents/hf.py +++ b/modeling/inference_models/parents/hf.py @@ -34,12 +34,12 @@ class HFInferenceModel(InferenceModel): requested_parameters = [] if model_path is not None and os.path.exists(model_path): - model_config = AutoConfig.from_pretrained(model_path) + self.model_config = AutoConfig.from_pretrained(model_path) elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))): - model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache") + self.model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache") else: - model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache") - layer_count = model_config["n_layer"] if isinstance(model_config, dict) else model_config.num_layers if hasattr(model_config, "num_layers") else model_config.n_layer if hasattr(model_config, "n_layer") else model_config.num_hidden_layers if hasattr(model_config, 'num_hidden_layers') else None + self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache") + layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None if layer_count is not None and layer_count >= 0: if os.path.exists("settings/{}.breakmodel".format(model_name.replace("/", "_"))): with open("settings/{}.breakmodel".format(model_name.replace("/", "_")), "r") as file: @@ -61,11 +61,11 @@ class HFInferenceModel(InferenceModel): "uitype": "slider", "unit": "int", "label": "{} Layers".format(torch.cuda.get_device_name(i)), - "id": "{} Layers".format(i), + "id": "{}_Layers".format(i), "min": 0, "max": layer_count, "step": 1, - "check": {"sum": ["{} Layers".format(i) for i in range(gpu_count)]+['CPU Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": 
layer_count, 'check': "="}, + "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="}, "check_message": "The sum of assigned layers must equal {}".format(layer_count), "default": break_values[i], "tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)), @@ -77,11 +77,11 @@ class HFInferenceModel(InferenceModel): "uitype": "slider", "unit": "int", "label": "CPU Layers", - "id": "CPU Layers", + "id": "CPU_Layers", "min": 0, "max": layer_count, "step": 1, - "check": {"sum": ["{} Layers".format(i) for i in range(gpu_count)]+['CPU Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="}, + "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="}, "check_message": "The sum of assigned layers must equal {}".format(layer_count), "default": layer_count - sum(break_values), "tooltip": "The number of layers to put on the CPU. This will use your system RAM. It will also do inference partially on CPU. Use if you must.", @@ -98,7 +98,7 @@ class HFInferenceModel(InferenceModel): "min": 0, "max": layer_count, "step": 1, - "check": {"sum": ["{} Layers".format(i) for i in range(gpu_count)]+['CPU Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="}, + "check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="}, "check_message": "The sum of assigned layers must equal {}".format(layer_count), "default": disk_blocks, "tooltip": "The number of layers to put on the disk. This will use your hard drive. The is VERY slow in comparison to GPU or CPU. 
Use as a last resort.", @@ -122,10 +122,40 @@ class HFInferenceModel(InferenceModel): return requested_parameters - def set_input_parameters(self, layers=[], disk_layers=0, use_gpu=False): + def set_input_parameters(self, parameters): + gpu_count = torch.cuda.device_count() + layers = [] + for i in range(gpu_count): + layers.append(int(parameters["{}_Layers".format(i)]) if parameters["{}_Layers".format(i)].isnumeric() else None) + self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None self.layers = layers - self.disk_layers = disk_layers - self.use_gpu = use_gpu + self.disk_layers = parameters['disk_layers'] if 'disk_layers' in parameters else None + self.use_gpu = parameters['use_gpu'] if 'use_gpu' in parameters else None + self.model_name = parameters['id'] + self.path = parameters['path'] if 'path' in parameters else None + + def unload(self): + if hasattr(self, 'model'): + self.model = None + if hasattr(self, 'tokenizer'): + self.tokenizer = None + if hasattr(self, 'model_config'): + self.model_config = None + with torch.no_grad(): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated") + for tensor in gc.get_objects(): + try: + if torch.is_tensor(tensor): + tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype)) + except: + pass + gc.collect() + try: + with torch.no_grad(): + torch.cuda.empty_cache() + except: + pass def _post_load(self) -> None: # These are model specific tokenizer overrides if a model has bad defaults @@ -187,7 +217,7 @@ class HFInferenceModel(InferenceModel): return model_path - basename = utils.koboldai_vars.model.replace("/", "_") + basename = self.model_name.replace("/", "_") if legacy: ret = basename else: diff --git a/modeling/inference_models/parents/hf_torch.py b/modeling/inference_models/parents/hf_torch.py index d8afafb1..4de13d7b 100644 --- a/modeling/inference_models/parents/hf_torch.py +++ b/modeling/inference_models/parents/hf_torch.py @@ -398,7 +398,7 @@ class HFTorchInferenceModel(HFInferenceModel): Embedding._koboldai_patch_causallm_model = self.model def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True): - if not self.lazy_load: + if not utils.koboldai_vars.lazy_load: return if utils.args.breakmodel_disklayers is not None: diff --git a/modeling/inference_models/readonly.py b/modeling/inference_models/readonly.py new file mode 100644 index 00000000..c642c05a --- /dev/null +++ b/modeling/inference_models/readonly.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import torch +import requests +import numpy as np +from typing import List, Optional, Union + +import utils +from logger import logger +from modeling.inference_model import ( + GenerationResult, + GenerationSettings, + InferenceModel, + ModelCapabilities, +) + + +class BasicAPIException(Exception): + """To be used for errors when using the Basic API as an interface.""" + + +class model_loader(InferenceModel): + def __init__(self) -> None: + super().__init__() + + # Do not allow API to be served over the API + self.capabilties = ModelCapabilities(api_host=False) + self.tokenizer = self._tokenizer() + self.model = None + self.model_name = "Read Only" + + def is_valid(self, model_name, model_path, menu_path): + return model_name == "ReadOnly" + + def get_requested_parameters(self, model_name, model_path, menu_path): + requested_parameters = [] + return requested_parameters + + def set_input_parameters(self, parameters): + return + + def unload(self): 
+ utils.koboldai_vars.noai = False + + def _initialize_model(self): + return + + class _tokenizer(): + def __init__(self): + self._koboldai_header = [] + def decode(self, _input): + return "" + def encode(self, input_text): + return [] + + def _load(self, save_model: bool = False, initial_load: bool = False) -> None: + self.tokenizer = self.tokenizer + self.model = None + utils.koboldai_vars.noai = True + + def _raw_generate( + self, + prompt_tokens: Union[List[int], torch.Tensor], + max_new: int, + gen_settings: GenerationSettings, + single_line: bool = False, + batch_count: int = 1, + seed: Optional[int] = None, + **kwargs, + ): + return GenerationResult( + model=self, + out_batches=np.array([]), + prompt=prompt_tokens, + is_whole_generation=True, + single_line=single_line, + ) diff --git a/static/koboldai.js b/static/koboldai.js index 1907add8..7f004ff2 100644 --- a/static/koboldai.js +++ b/static/koboldai.js @@ -14,8 +14,8 @@ socket.on('load_popup', function(data){load_popup(data);}); socket.on('popup_items', function(data){popup_items(data);}); socket.on('popup_breadcrumbs', function(data){popup_breadcrumbs(data);}); socket.on('popup_edit_file', function(data){popup_edit_file(data);}); -socket.on('show_model_menu', function(data){show_model_menu(data);}); -socket.on('open_model_load_menu', function(data){new_show_model_menu(data);}); +//socket.on('show_model_menu', function(data){show_model_menu(data);}); +socket.on('open_model_load_menu', function(data){show_model_menu(data);}); socket.on('selected_model_info', function(data){selected_model_info(data);}); socket.on('oai_engines', function(data){oai_engines(data);}); socket.on('buildload', function(data){buildload(data);}); @@ -1502,13 +1502,18 @@ function getModelParameterCount(modelName) { return base * multiplier; } -function new_show_model_menu(data) { +function show_model_menu(data) { //clear out the loadmodelsettings var loadmodelsettings = document.getElementById('loadmodelsettings') while (loadmodelsettings.firstChild) { loadmodelsettings.removeChild(loadmodelsettings.firstChild); } - document.getElementById("modelplugin").classList.add("hidden"); + //Clear out plugin selector + var model_plugin = document.getElementById('modelplugin'); + while (model_plugin.firstChild) { + model_plugin.removeChild(model_plugin.firstChild); + } + model_plugin.classList.add("hidden"); var accept = document.getElementById("btn_loadmodelaccept"); accept.disabled = false;
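
For context, a minimal sketch of the plugin-driven load flow this patch moves toward, assuming `model_loaders` is a registry of `model_loader` instances keyed by plugin name (as referenced by the new `load_model(plugin)` and `UI_2_load_model`) and that the UI submits its settings as a plain dict; the "readonly" key and the example payload below are illustrative only, not taken from the patch:

    # Python sketch of the reworked flow; names follow the diff where possible,
    # everything payload-related is a hypothetical example.
    from modeling.inference_models.readonly import model_loader as ReadOnlyLoader

    model_loaders = {"readonly": ReadOnlyLoader()}

    def load_from_ui(data: dict):
        loader = model_loaders[data["plugin"]]
        loader.set_input_parameters(data)   # backends now receive the raw dict
        loader.load(initial_load=False)     # load_model(plugin) ends up calling this
        return loader

    # Hypothetical UI payload for the read-only backend:
    load_from_ui({"plugin": "readonly", "id": "ReadOnly"})

Each backend then pulls only the keys it needs out of the dict (for example, `api.py` reads `parameters['base_url']`, `horde.py` reads `key`, `model`, and `url`), which is why the `set_input_parameters(**data)` keyword expansion was dropped in `UI_2_load_model`.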