Next iteration. Model loading is completely broken now :)

This commit is contained in:
ebolam
2023-05-11 12:08:35 -04:00
parent 77dd5aa725
commit 4605d10c37
11 changed files with 170 additions and 187 deletions

View File

@@ -645,10 +645,14 @@ def new_socketio_on(*a, **k):
socketio.on = new_socketio_on
def emit(*args, **kwargs):
try:
return _emit(*args, **kwargs)
except AttributeError:
return socketio.emit(*args, **kwargs)
if has_request_context():
try:
return _emit(*args, **kwargs)
except AttributeError:
return socketio.emit(*args, **kwargs)
else: # We're trying to send data outside of the HTTP context, which won't work. Try the relay instead.
if koboldai_settings.queue is not None:
koboldai_settings.queue.put([args[0], args[1], kwargs])
utils.emit = emit
#replacement for tpool.execute to maintain request contexts
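About the emit() change above: when emit() is called outside an HTTP request context it now drops the event onto koboldai_settings.queue instead of failing. A minimal sketch of what the consuming side of that relay might look like, assuming queue items are the [event, data, kwargs] triples queued above (the function name and loop are illustrative, not the project's actual relay code):

import koboldai_settings  # module assumed to expose the shared relay queue

def socket_relay_consumer(socketio):
    # Hypothetical consumer: drain events queued outside the HTTP context and
    # re-send them from a context that socketio owns.
    while True:
        event, data, kwargs = koboldai_settings.queue.get()  # blocks until an item is queued
        socketio.emit(event, data, **kwargs)

It would be started once, e.g. socketio.start_background_task(socket_relay_consumer, socketio); treat this purely as an illustration of the queue contract.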
@@ -1780,10 +1784,6 @@ def get_cluster_models(msg):
emit('from_server', {'cmd': 'oai_engines', 'data': engines, 'online_model': online_model}, broadcast=True, room="UI_1")
emit('oai_engines', {'data': engines, 'online_model': online_model}, broadcast=False, room="UI_2")
def reset_model_settings():
koboldai_vars.reset_for_model_load()
def unload_model():
global model
@@ -1816,7 +1816,7 @@ def unload_model():
koboldai_vars.badwordsids = koboldai_settings.badwordsids_default
def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False):
def load_model(plugin, initial_load=False):
global model
global tokenizer
global model_config
@@ -1827,79 +1827,18 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
if initial_load:
use_breakmodel_args = True
reset_model_settings()
koboldai_vars.reset_model()
koboldai_vars.cluster_requested_models = [online_model] if isinstance(online_model, str) else online_model
if koboldai_vars.cluster_requested_models == [""]:
koboldai_vars.cluster_requested_models = []
koboldai_vars.noai = False
if not use_breakmodel_args:
set_aibusy(True)
if koboldai_vars.model != 'ReadOnly':
emit('from_server', {'cmd': 'model_load_status', 'data': "Loading {}".format(koboldai_vars.model)}, broadcast=True)
#Have to add a sleep so the server will send the emit for some reason
time.sleep(0.1)
set_aibusy(True)
if koboldai_vars.model != 'ReadOnly':
emit('from_server', {'cmd': 'model_load_status', 'data': "Loading {}".format(koboldai_vars.model)}, broadcast=True)
#Have to add a sleep so the server will send the emit for some reason
time.sleep(0.1)
if gpu_layers is not None:
args.breakmodel_gpulayers = gpu_layers
elif use_breakmodel_args:
gpu_layers = args.breakmodel_gpulayers
if breakmodel_args_default_to_cpu and gpu_layers is None:
gpu_layers = args.breakmodel_gpulayers = []
if disk_layers is not None:
args.breakmodel_disklayers = int(disk_layers)
elif use_breakmodel_args:
disk_layers = args.breakmodel_disklayers
if breakmodel_args_default_to_cpu and disk_layers is None:
disk_layers = args.breakmodel_disklayers = 0
if 'model' in globals():
model.unload()
unload_model()
if online_model == "":
koboldai_vars.configname = getmodelname()
#Let's set the GooseAI or OpenAI server URLs if that's applicable
else:
koboldai_vars.online_model = online_model
# Swap OAI Server if GooseAI was selected
if koboldai_vars.model == "GooseAI":
koboldai_vars.oaiengines = "https://api.goose.ai/v1/engines"
koboldai_vars.model = "OAI"
koboldai_vars.configname = f"GooseAI_{online_model.replace('/', '_')}"
elif koboldai_vars.model == "CLUSTER" and isinstance(online_model, list):
if len(online_model) != 1:
koboldai_vars.configname = koboldai_vars.model
else:
koboldai_vars.configname = f"{koboldai_vars.model}_{online_model[0].replace('/', '_')}"
else:
koboldai_vars.configname = f"{koboldai_vars.model}_{online_model.replace('/', '_')}"
if path.exists(get_config_filename()):
changed=False
with open(get_config_filename(), "r") as file:
# Check if API key exists
js = json.load(file)
if 'online_model' in js:
if js['online_model'] != online_model:
changed=True
js['online_model'] = online_model
else:
changed=True
js['online_model'] = online_model
if changed:
with open("settings/{}.v2_settings".format(koboldai_vars.model), "w") as file:
file.write(json.dumps(js, indent=3))
# Swap OAI Server if GooseAI was selected
if koboldai_vars.model == "GooseAI":
koboldai_vars.oaiengines = "https://api.goose.ai/v1/engines"
koboldai_vars.model = "OAI"
args.configname = "GooseAI" + "/" + online_model
elif koboldai_vars.model != "CLUSTER":
args.configname = koboldai_vars.model + "/" + online_model
koboldai_vars.oaiurl = koboldai_vars.oaiengines + "/{0}/completions".format(online_model)
# If transformers model was selected & GPU available, ask to use CPU or GPU
if(not koboldai_vars.use_colab_tpu and koboldai_vars.model not in ["InferKit", "Colab", "API", "CLUSTER", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]):
@@ -1937,84 +1876,9 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
else:
koboldai_vars.default_preset = koboldai_settings.default_preset
# Ask for API key if InferKit was selected
if koboldai_vars.model == "InferKit":
koboldai_vars.apikey = koboldai_vars.oaiapikey
# Swap OAI Server if GooseAI was selected
if koboldai_vars.model == "GooseAI":
koboldai_vars.oaiengines = "https://api.goose.ai/v1/engines"
koboldai_vars.model = "OAI"
koboldai_vars.configname = "GooseAI"
# Ask for API key if OpenAI was selected
if koboldai_vars.model == "OAI" and not koboldai_vars.configname:
koboldai_vars.configname = "OAI"
if koboldai_vars.model == "ReadOnly":
koboldai_vars.noai = True
# TODO: InferKit
if koboldai_vars.model == "ReadOnly" or koboldai_vars.noai:
pass
elif koboldai_vars.model in ["Colab", "API", "CLUSTER", "OAI"]:
koboldai_vars.colaburl = url or koboldai_vars.colaburl
koboldai_vars.usegpu = False
koboldai_vars.breakmodel = False
if koboldai_vars.model == "Colab":
from modeling.inference_models.basic_api import model_loader
model = model_loader()
elif koboldai_vars.model == "API":
from modeling.inference_models.api import model_loader
model = model_loader(koboldai_vars.colaburl.replace("/request", ""))
elif koboldai_vars.model == "CLUSTER":
from modeling.inference_models.horde import model_loader
model = model_loader()
elif koboldai_vars.model == "OAI":
from modeling.inference_models.openai import model_loader
model = model_loader()
model.load(initial_load=initial_load)
# TODO: This check sucks, make a model object or something
elif "rwkv" in koboldai_vars.model:
if koboldai_vars.use_colab_tpu:
raise RuntimeError("RWKV is not supported on the TPU.")
from modeling.inference_models.rwkv import model_loader
model = model_loader(koboldai_vars.model)
model.load()
elif not koboldai_vars.use_colab_tpu and not koboldai_vars.noai:
# HF Torch
logger.init("Transformers", status='Starting')
for m in ("GPTJModel", "XGLMModel"):
try:
globals()[m] = getattr(__import__("transformers"), m)
except:
pass
from modeling.inference_models.generic_hf_torch import model_loader
model = model_loader(
koboldai_vars.model,
lazy_load=koboldai_vars.lazy_load,
low_mem=args.lowmem
)
model.load(
save_model=not (args.colab or args.cacheonly) or args.savemodel,
initial_load=initial_load,
)
logger.info(f"Pipeline created: {koboldai_vars.model}")
else:
# TPU
from modeling.inference_models.hf_mtj import model_loader
model = model_loader(
koboldai_vars.model
)
model.load(
save_model=not (args.colab or args.cacheonly) or args.savemodel,
initial_load=initial_load,
)
model = model_loaders[plugin]
model.load(initial_load=initial_load)
# TODO: Convert everywhere to use model.tokenizer
if model:
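load_model() now only looks the loader up in a model_loaders registry keyed by plugin name and delegates to its load(). A rough sketch of how such a registry could be assembled from the modeling.inference_models package, assuming each module exposes a model_loader class with a no-argument constructor (the discovery loop is illustrative; the real registry may be built differently):

import importlib
import pkgutil

import modeling.inference_models

model_loaders = {}
for module_info in pkgutil.iter_modules(modeling.inference_models.__path__):
    module = importlib.import_module(f"modeling.inference_models.{module_info.name}")
    if hasattr(module, "model_loader"):
        # Key the registry by module name so the UI's data['plugin'] can select it.
        model_loaders[module_info.name] = module.model_loader()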
@@ -6532,7 +6396,8 @@ def UI_2_select_model(data):
def UI_2_load_model(data):
logger.info("loading Model")
logger.info(data)
model_loaders[data['plugin']].set_input_parameters(**data)
model_loaders[data['plugin']].set_input_parameters(data)
load_model(data['plugin'])
#load_model(use_gpu=data['use_gpu'], gpu_layers=data['gpu_layers'], disk_layers=data['disk_layers'], online_model=data['online_model'], url=koboldai_vars.colaburl, use_8_bit=data['use_8_bit'])
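UI_2_load_model now hands the raw socket payload to the selected plugin's set_input_parameters() and then calls load_model() with just the plugin name. An illustrative payload for the Hugging Face torch loader, using the "{}_Layers" / "CPU_Layers" / "Disk_Layers" ids defined in HFInferenceModel further down (the plugin key, model id, and values here are assumptions, and the snippet is meant to run in aiserver.py's namespace):

data = {
    "plugin": "generic_hf_torch",     # assumed registry key for the HF torch loader
    "id": "EleutherAI/gpt-neo-2.7B",  # illustrative model id
    "path": None,
    "use_gpu": True,
    "0_Layers": "24",                 # per-GPU slider values arrive as strings
    "1_Layers": "8",
    "CPU_Layers": "0",
    "Disk_Layers": "0",
}
model_loaders[data["plugin"]].set_input_parameters(data)
load_model(data["plugin"])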
#==================================================================#
@@ -8155,7 +8020,8 @@ def send_one_time_messages(data, wait_time=0):
# Test
#==================================================================#
def model_info():
if model_config is not None:
global model_config
if 'model_config' in globals() and model_config is not None:
if isinstance(model_config, dict):
if 'model_type' in model_config:
model_type = str(model_config['model_type'])
@@ -11045,7 +10911,7 @@ for schema in config_endpoint_schemas:
def startup():
if koboldai_vars.model == "" or koboldai_vars.model is None:
koboldai_vars.model = "ReadOnly"
socketio.start_background_task(load_model, **{'initial_load':True})
socketio.start_background_task(load_model, *('readonly',), **{'initial_load':True})
print("", end="", flush=True)

View File

@@ -169,6 +169,7 @@ class InferenceModel:
]
self.tokenizer = None
self.capabilties = ModelCapabilities()
self.model_name = "Not Defined"
def is_valid(self, model_name, model_path, menu_path, vram):
return True
@@ -176,7 +177,7 @@ class InferenceModel:
def requested_parameters(self, model_name, model_path, menu_path, vram):
return {}
def define_input_parameters(self):
def set_input_parameters(self, parameters):
return
def load(self, save_model: bool = False, initial_load: bool = False) -> None:
@@ -186,6 +187,9 @@ class InferenceModel:
self._load(save_model=save_model, initial_load=initial_load)
self._post_load()
def unload(self):
return
def _pre_load(self) -> None:
"""Pre load hook. Called before `_load()`."""

View File

@@ -46,8 +46,8 @@ class model_loader(InferenceModel):
})
return requested_parameters
def set_input_parameters(self, base_url=""):
self.base_url = base_url.rstrip("/")
def set_input_parameters(self, parameters):
self.base_url = parameters['base_url'].rstrip("/")
def _load(self, save_model: bool, initial_load: bool) -> None:
tokenizer_id = requests.get(f"{self.base_url}/api/v1/model").json()["result"]

View File

@@ -45,8 +45,8 @@ class model_loader(InferenceModel):
})
return requested_parameters
def set_input_parameters(self, colaburl=""):
self.colaburl = colaburl
def set_input_parameters(self, parameters):
self.colaburl = parameters['colaburl']
def _initialize_model(self):
return

View File

@@ -30,6 +30,7 @@ class model_loader(HFTorchInferenceModel):
def _load(self, save_model: bool, initial_load: bool) -> None:
utils.koboldai_vars.allowsp = True
self.lazy_load = utils.koboldai_vars.lazy_load
# Make model path the same as the model name to make this consistent
# with the other loading method if it isn't a known model type. This

View File

@@ -78,10 +78,10 @@ class model_loader(InferenceModel):
}])
return requested_parameters
def set_input_parameters(self, url="", key="", model=""):
self.key = key.strip()
self.model = model
self.url = url
def set_input_parameters(self, parameters):
self.key = parameters['key'].strip()
self.model = parameters['model']
self.url = parameters['url']
def get_cluster_models(self):
# Get list of models from public cluster

View File

@@ -59,9 +59,9 @@ class model_loader(InferenceModel):
}])
return requested_parameters
def set_input_parameters(self, key="", model=""):
self.key = key.strip()
self.model = model
def set_input_parameters(self, parameters):
self.key = parameters['key'].strip()
self.model = parameters['model']
def get_oai_models(self):
if self.key == "":

View File

@@ -34,12 +34,12 @@ class HFInferenceModel(InferenceModel):
requested_parameters = []
if model_path is not None and os.path.exists(model_path):
model_config = AutoConfig.from_pretrained(model_path)
self.model_config = AutoConfig.from_pretrained(model_path)
elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))):
model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache")
self.model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache")
else:
model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
layer_count = model_config["n_layer"] if isinstance(model_config, dict) else model_config.num_layers if hasattr(model_config, "num_layers") else model_config.n_layer if hasattr(model_config, "n_layer") else model_config.num_hidden_layers if hasattr(model_config, 'num_hidden_layers') else None
self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
if layer_count is not None and layer_count >= 0:
if os.path.exists("settings/{}.breakmodel".format(model_name.replace("/", "_"))):
with open("settings/{}.breakmodel".format(model_name.replace("/", "_")), "r") as file:
@@ -61,11 +61,11 @@ class HFInferenceModel(InferenceModel):
"uitype": "slider",
"unit": "int",
"label": "{} Layers".format(torch.cuda.get_device_name(i)),
"id": "{} Layers".format(i),
"id": "{}_Layers".format(i),
"min": 0,
"max": layer_count,
"step": 1,
"check": {"sum": ["{} Layers".format(i) for i in range(gpu_count)]+['CPU Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"check_message": "The sum of assigned layers must equal {}".format(layer_count),
"default": break_values[i],
"tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)),
@@ -77,11 +77,11 @@ class HFInferenceModel(InferenceModel):
"uitype": "slider",
"unit": "int",
"label": "CPU Layers",
"id": "CPU Layers",
"id": "CPU_Layers",
"min": 0,
"max": layer_count,
"step": 1,
"check": {"sum": ["{} Layers".format(i) for i in range(gpu_count)]+['CPU Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"check_message": "The sum of assigned layers must equal {}".format(layer_count),
"default": layer_count - sum(break_values),
"tooltip": "The number of layers to put on the CPU. This will use your system RAM. It will also do inference partially on CPU. Use if you must.",
@@ -98,7 +98,7 @@ class HFInferenceModel(InferenceModel):
"min": 0,
"max": layer_count,
"step": 1,
"check": {"sum": ["{} Layers".format(i) for i in range(gpu_count)]+['CPU Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"check_message": "The sum of assigned layers must equal {}".format(layer_count),
"default": disk_blocks,
"tooltip": "The number of layers to put on the disk. This will use your hard drive. The is VERY slow in comparison to GPU or CPU. Use as a last resort.",
@@ -122,10 +122,40 @@ class HFInferenceModel(InferenceModel):
return requested_parameters
def set_input_parameters(self, layers=[], disk_layers=0, use_gpu=False):
def set_input_parameters(self, parameters):
gpu_count = torch.cuda.device_count()
layers = []
for i in range(gpu_count):
layers.append(int(parameters["{}_Layers".format(i)]) if parameters["{}_Layers".format(i)].isnumeric() else None)
self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None
self.layers = layers
self.disk_layers = disk_layers
self.use_gpu = use_gpu
self.disk_layers = parameters['disk_layers'] if 'disk_layers' in parameters else None
self.use_gpu = parameters['use_gpu'] if 'use_gpu' in parameters else None
self.model_name = parameters['id']
self.path = parameters['path'] if 'path' in parameters else None
def unload(self):
if hasattr(self, 'model'):
self.model = None
if hasattr(self, 'tokenizer'):
self.tokenizer = None
if hasattr(self, 'model_config'):
self.model_config = None
with torch.no_grad():
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated")
for tensor in gc.get_objects():
try:
if torch.is_tensor(tensor):
tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype))
except:
pass
gc.collect()
try:
with torch.no_grad():
torch.cuda.empty_cache()
except:
pass
def _post_load(self) -> None:
# These are model specific tokenizer overrides if a model has bad defaults
@@ -187,7 +217,7 @@ class HFInferenceModel(InferenceModel):
return model_path
basename = utils.koboldai_vars.model.replace("/", "_")
basename = self.model_name.replace("/", "_")
if legacy:
ret = basename
else:

View File

@@ -398,7 +398,7 @@ class HFTorchInferenceModel(HFInferenceModel):
Embedding._koboldai_patch_causallm_model = self.model
def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True):
if not self.lazy_load:
if not utils.koboldai_vars.lazy_load:
return
if utils.args.breakmodel_disklayers is not None:

View File

@@ -0,0 +1,77 @@
from __future__ import annotations
import torch
import requests
import numpy as np
from typing import List, Optional, Union
import utils
from logger import logger
from modeling.inference_model import (
GenerationResult,
GenerationSettings,
InferenceModel,
ModelCapabilities,
)
class BasicAPIException(Exception):
"""To be used for errors when using the Basic API as an interface."""
class model_loader(InferenceModel):
def __init__(self) -> None:
super().__init__()
# Do not allow API to be served over the API
self.capabilties = ModelCapabilities(api_host=False)
self.tokenizer = self._tokenizer()
self.model = None
self.model_name = "Read Only"
def is_valid(self, model_name, model_path, menu_path):
return model_name == "ReadOnly"
def get_requested_parameters(self, model_name, model_path, menu_path):
requested_parameters = []
return requested_parameters
def set_input_parameters(self, parameters):
return
def unload(self):
utils.koboldai_vars.noai = False
def _initialize_model(self):
return
class _tokenizer():
def __init__(self):
self._koboldai_header = []
def decode(self, _input):
return ""
def encode(self, input_text):
return []
def _load(self, save_model: bool = False, initial_load: bool = False) -> None:
self.tokenizer = self.tokenizer
self.model = None
utils.koboldai_vars.noai = True
def _raw_generate(
self,
prompt_tokens: Union[List[int], torch.Tensor],
max_new: int,
gen_settings: GenerationSettings,
single_line: bool = False,
batch_count: int = 1,
seed: Optional[int] = None,
**kwargs,
):
return GenerationResult(
model=self,
out_batches=np.array([]),
prompt=prompt_tokens,
is_whole_generation=True,
single_line=single_line,
)
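startup() above now boots through load_model('readonly', initial_load=True), so this loader is presumably registered under the 'readonly' key. A hypothetical registration (the module path and wiring are not shown in this diff):

from modeling.inference_models.readonly import model_loader as readonly_loader  # assumed path

model_loaders["readonly"] = readonly_loader()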

View File

@@ -14,8 +14,8 @@ socket.on('load_popup', function(data){load_popup(data);});
socket.on('popup_items', function(data){popup_items(data);});
socket.on('popup_breadcrumbs', function(data){popup_breadcrumbs(data);});
socket.on('popup_edit_file', function(data){popup_edit_file(data);});
socket.on('show_model_menu', function(data){show_model_menu(data);});
socket.on('open_model_load_menu', function(data){new_show_model_menu(data);});
//socket.on('show_model_menu', function(data){show_model_menu(data);});
socket.on('open_model_load_menu', function(data){show_model_menu(data);});
socket.on('selected_model_info', function(data){selected_model_info(data);});
socket.on('oai_engines', function(data){oai_engines(data);});
socket.on('buildload', function(data){buildload(data);});
@@ -1502,13 +1502,18 @@ function getModelParameterCount(modelName) {
return base * multiplier;
}
function new_show_model_menu(data) {
function show_model_menu(data) {
//clear out the loadmodelsettings
var loadmodelsettings = document.getElementById('loadmodelsettings')
while (loadmodelsettings.firstChild) {
loadmodelsettings.removeChild(loadmodelsettings.firstChild);
}
document.getElementById("modelplugin").classList.add("hidden");
//Clear out plugin selector
var model_plugin = document.getElementById('modelplugin');
while (model_plugin.firstChild) {
model_plugin.removeChild(model_plugin.firstChild);
}
model_plugin.classList.add("hidden");
var accept = document.getElementById("btn_loadmodelaccept");
accept.disabled = false;