Moved model backends to separate folders

Added save/load for some model backend settings
ebolam
2023-05-18 20:14:33 -04:00
parent 4040538d34
commit 06f59a7b7b
14 changed files with 69 additions and 409 deletions

View File

@@ -626,9 +626,9 @@ import importlib
model_backend_code = {}
model_backends = {}
for module in os.listdir("./modeling/inference_models"):
if os.path.isfile(os.path.join("./modeling/inference_models",module)) and module[-3:] == '.py':
model_backend_code[module[:-3]] = importlib.import_module('modeling.inference_models.{}'.format(module[:-3]))
model_backends[model_backend_code[module[:-3]].model_backend_name] = model_backend_code[module[:-3]].model_backend()
if not os.path.isfile(os.path.join("./modeling/inference_models",module)) and module != '__pycache__':
model_backend_code[module] = importlib.import_module('modeling.inference_models.{}.class'.format(module))
model_backends[model_backend_code[module].model_backend_name] = model_backend_code[module].model_backend()
old_socketio_on = socketio.on
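
The hunk above swaps flat `<backend>.py` files for per-backend folders: every directory under modeling/inference_models (except __pycache__) is imported as modeling.inference_models.<folder>.class and registered under its model_backend_name. A minimal standalone sketch of the same pattern (the helper name discover_backends is illustrative, not part of the commit):

import importlib
import os

def discover_backends():
    """Map each backend's display name to a fresh backend instance."""
    root = "./modeling/inference_models"
    backends = {}
    for entry in os.listdir(root):
        # Folders are backends now; loose files and __pycache__ are skipped.
        if os.path.isfile(os.path.join(root, entry)) or entry == "__pycache__":
            continue
        # 'class' is a Python keyword, but import_module() takes a plain
        # string, so <folder>/class.py imports without issue.
        module = importlib.import_module("modeling.inference_models.{}.class".format(entry))
        backends[module.model_backend_name] = module.model_backend()
    return backends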

View File

@@ -188,6 +188,7 @@ class InferenceModel:
self._pre_load()
self._load(save_model=save_model, initial_load=initial_load)
self._post_load()
self._save_settings()
def unload(self):
return
@@ -198,6 +199,9 @@ class InferenceModel:
def _post_load(self) -> None:
"""Post load hook. Called after `_load()`."""
def _save_settings(self) -> None:
"""Save settings hook. Called after `_post_load()`."""
def _load(self, save_model: bool, initial_load: bool) -> None:
"""Main load method. All logic related to loading the model onto the
selected device(s) and preparing it for inference should be implemented here."""
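
With this change the load sequence becomes `_pre_load()`, `_load()`, `_post_load()`, and now `_save_settings()`, with a docstring-only default so existing backends are unaffected. A hedged sketch of a backend using the new hook (the class name and settings path are illustrative, not from this commit):

import json

from modeling.inference_model import InferenceModel

class SketchBackend(InferenceModel):
    def __init__(self) -> None:
        super().__init__()
        self.base_url = ""

    def _load(self, save_model: bool, initial_load: bool) -> None:
        # A real backend would build its tokenizer/model here.
        pass

    def _save_settings(self) -> None:
        # Runs automatically at the end of load(); persist anything the
        # backend should restore on the next launch.
        with open("settings/sketch.model_backend.settings", "w") as f:
            json.dump({"base_url": self.base_url}, f)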

View File

@@ -26,19 +26,22 @@ class APIException(Exception):
class model_backend(InferenceModel):
def __init__(self) -> None:
super().__init__()
#self.base_url = ""
self.base_url = ""
def is_valid(self, model_name, model_path, menu_path):
return model_name == "API"
def get_requested_parameters(self, model_name, model_path, menu_path):
if os.path.exists("settings/api.model_backend.settings") and 'base_url' not in vars(self):
with open("settings/api.model_backend.settings", "r") as f:
self.base_url = json.load(f)['base_url']
requested_parameters = []
requested_parameters.append({
"uitype": "text",
"unit": "text",
"label": "URL",
"id": "base_url",
"default": False,
"default": self.base_url,
"check": {"value": "", 'check': "!="},
"tooltip": "The URL of the KoboldAI API to connect to.",
"menu_path": "",
@@ -58,6 +61,10 @@ class model_backend(InferenceModel):
# Do not allow API to be served over the API
self.capabilties = ModelCapabilities(api_host=False)
def _save_settings(self):
with open("settings/api.model_backend.settings", "w") as f:
json.dump({"base_url": self.base_url}, f, indent="")
def _raw_generate(
self,
prompt_tokens: Union[List[int], torch.Tensor],
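
The file written by `_save_settings()` above is plain JSON (e.g. {"base_url": "https://..."}), and `get_requested_parameters()` reads it back only when base_url has not been set yet, so the saved URL becomes the form default. A slightly more defensive read, shown purely as an illustration (the .get() fallback and helper name are additions, not in the commit):

import json
import os

def load_backend_setting(path, key, default=""):
    """Return one value from a <backend>.model_backend.settings JSON file."""
    if not os.path.exists(path):
        return default
    with open(path, "r") as f:
        return json.load(f).get(key, default)

# Example: base_url = load_backend_setting("settings/api.model_backend.settings", "base_url")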

View File

@@ -24,6 +24,7 @@ class BasicAPIException(Exception):
class model_backend(InferenceModel):
def __init__(self) -> None:
super().__init__()
self.colaburl = ""
# Do not allow API to be served over the API
self.capabilties = ModelCapabilities(api_host=False)
@@ -32,13 +33,16 @@ class model_backend(InferenceModel):
return model_name == "Colab"
def get_requested_parameters(self, model_name, model_path, menu_path):
if os.path.exists("settings/api.model_backend.settings") and 'colaburl' not in vars(self):
with open("settings/api.model_backend.settings", "r") as f:
self.colaburl = json.load(f)['base_url']
requested_parameters = []
requested_parameters.append({
"uitype": "text",
"unit": "text",
"label": "URL",
"id": "colaburl",
"default": False,
"default": self.colaburl,
"check": {"value": "", 'check': "!="},
"tooltip": "The URL of the Colab KoboldAI API to connect to.",
"menu_path": "",
@@ -56,6 +60,10 @@ class model_backend(InferenceModel):
def _load(self, save_model: bool, initial_load: bool) -> None:
self.tokenizer = self._get_tokenizer("EleutherAI/gpt-neo-2.7B")
def _save_settings(self):
with open("settings/basic_api.model_backend.settings", "w") as f:
json.dump({"colaburl": self.colaburl}, f, indent="")
def _raw_generate(
self,
prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -20,7 +20,7 @@ except ModuleNotFoundError as e:
if not utils.koboldai_vars.use_colab_tpu:
raise e
from modeling.inference_models.parents.hf_torch import HFTorchInferenceModel
from modeling.inference_models.hf_torch import HFTorchInferenceModel
model_backend_name = "Huggingface"
@@ -270,3 +270,7 @@ class model_backend(HFTorchInferenceModel):
self.model.kai_model = self
utils.koboldai_vars.modeldim = self.get_hidden_size()
def _save_settings(self):
with open("settings/{}.generic_hf_torch.model_backend.settings".format(self.model_name.replace("/", "_")), "w") as f:
json.dump({"layers": self.layers if 'layers' in vars(self) else [], "disk_layers": self.disk_layers if 'disk_layers' in vars(self) else 0}, f, indent="")

View File

@@ -11,14 +11,14 @@ from modeling.inference_model import (
InferenceModel,
)
from modeling.inference_models.parents.openai_gooseai import model_backend as openai_gooseai_model_backend
from modeling.inference_models.openai_gooseai import model_backend as openai_gooseai_model_backend
model_backend_name = "GooseAI"
class OpenAIAPIError(Exception):
def __init__(self, error_type: str, error_message) -> None:
super().__init__(f"{error_type}: {error_message}")
self.source = "GooseAI"
class model_backend(openai_gooseai_model_backend):

View File

@@ -3,6 +3,7 @@ from typing import Optional
from transformers import AutoConfig
import warnings
import utils
import json
import koboldai_settings
from logger import logger
from modeling.inference_model import InferenceModel
@@ -44,16 +45,15 @@ class HFInferenceModel(InferenceModel):
self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
if os.path.exists("settings/{}.breakmodel".format(model_name.replace("/", "_"))):
with open("settings/{}.breakmodel".format(model_name.replace("/", "_")), "r") as file:
data = [x for x in file.read().split("\n")[:2] if x != '']
if len(data) < 2:
data.append("0")
break_values, disk_blocks = data
break_values = break_values.split(",")
if os.path.exists("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
with open("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
temp = json.load(f)
break_values = temp['layers'] if 'layers' in temp else [layer_count]
disk_blocks = temp['disk_layers'] if 'disk_layers' in temp else 0
else:
break_values = [layer_count]
disk_blocks = 0
break_values = [int(x) for x in break_values if x != '' and x is not None]
gpu_count = torch.cuda.device_count()
break_values += [0] * (gpu_count - len(break_values))
@@ -132,8 +132,15 @@ class HFInferenceModel(InferenceModel):
if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
gpu_count = torch.cuda.device_count()
layers = []
logger.info(parameters)
for i in range(gpu_count):
layers.append(int(parameters["{}_Layers".format(i)]) if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric() else None)
logger.info(parameters["{}_Layers".format(i)])
if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric():
layers.append(int(parameters["{}_Layers".format(i)]))
elif isinstance(parameters["{}_Layers".format(i)], str):
layers.append(None)
else:
layers.append(parameters["{}_Layers".format(i)])
self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None
if isinstance(self.cpu_layers, str):
self.cpu_layers = int(self.cpu_layers) if self.cpu_layers.isnumeric() else 0
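
The reworked loop above distinguishes three cases for each "{i}_Layers" value coming from the UI: a numeric string becomes an int, any other string becomes None, and non-string values pass through unchanged; CPU_Layers gets the same numeric-string treatment. The same rule as a small helper, purely illustrative:

def parse_layer_value(value):
    """Numeric strings -> int, other strings -> None, anything else unchanged."""
    if isinstance(value, str):
        return int(value) if value.isnumeric() else None
    return value

# parse_layer_value("16") -> 16, parse_layer_value("") -> None, parse_layer_value(4) -> 4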

View File

@@ -16,7 +16,7 @@ from modeling.inference_model import (
GenerationSettings,
ModelCapabilities,
)
from modeling.inference_models.parents.hf import HFInferenceModel
from modeling.inference_models.hf import HFInferenceModel
from modeling.tokenizer import GenericTokenizer
model_backend_name = "Huggingface MTJ"

View File

@@ -31,7 +31,7 @@ from modeling import warpers
from modeling.warpers import Warper
from modeling.stoppers import Stoppers
from modeling.post_token_hooks import PostTokenHooks
from modeling.inference_models.parents.hf import HFInferenceModel
from modeling.inference_models.hf import HFInferenceModel
from modeling.inference_model import (
GenerationResult,
GenerationSettings,
@@ -823,135 +823,10 @@ class HFTorchInferenceModel(HFInferenceModel):
breakmodel.gpu_blocks = [0] * n_layers
return
elif (
utils.args.breakmodel_gpulayers is not None
or utils.args.breakmodel_disklayers is not None
or breakmodel.gpu_blocks != []
):
try:
if breakmodel.gpu_blocks == []:
if utils.args.breakmodel_gpulayers:
breakmodel.gpu_blocks = list(
map(int, utils.args.breakmodel_gpulayers.split(","))
)
assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count()
s = n_layers
for i in range(len(breakmodel.gpu_blocks)):
if breakmodel.gpu_blocks[i] <= -1:
breakmodel.gpu_blocks[i] = s
break
else:
s -= breakmodel.gpu_blocks[i]
assert sum(breakmodel.gpu_blocks) <= n_layers
n_layers -= sum(breakmodel.gpu_blocks)
n_layers -= breakmodel.disk_blocks
except:
logger.warning(
"--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0."
)
breakmodel.gpu_blocks = [n_layers]
n_layers = 0
elif utils.args.breakmodel_layers is not None:
breakmodel.gpu_blocks = [
n_layers - max(0, min(n_layers, utils.args.breakmodel_layers))
]
n_layers -= sum(breakmodel.gpu_blocks)
elif utils.args.model is not None:
elif breakmodel.gpu_blocks != []:
logger.info("Breakmodel not specified, assuming GPU 0")
breakmodel.gpu_blocks = [n_layers]
n_layers = 0
else:
device_count = torch.cuda.device_count()
if device_count > 1:
print(
Colors.CYAN
+ "\nPlease select one of your GPUs to be your primary GPU."
)
print(
"VRAM usage in your primary GPU will be higher than for your other ones."
)
print("It is recommended you make your fastest GPU your primary GPU.")
self.breakmodel_device_list(n_layers)
while True:
primaryselect = input("device ID> ")
if (
primaryselect.isnumeric()
and 0 <= int(primaryselect) < device_count
):
breakmodel.primary_device = int(primaryselect)
break
else:
print(
f"{Colors.RED}Please enter an integer between 0 and {device_count-1}.{Colors.END}"
)
else:
breakmodel.primary_device = 0
print(
Colors.PURPLE
+ "\nIf you don't have enough VRAM to run the model on a single GPU"
)
print(
"you can split the model between your CPU and your GPU(s), or between"
)
print("multiple GPUs if you have more than one.")
print("By putting more 'layers' on a GPU or CPU, more computations will be")
print(
"done on that device and more VRAM or RAM will be required on that device"
)
print("(roughly proportional to number of layers).")
print(
"It should be noted that GPUs are orders of magnitude faster than the CPU."
)
print(
f"This model has{Colors.YELLOW} {n_layers} {Colors.PURPLE}layers.{Colors.END}\n"
)
for i in range(device_count):
self.breakmodel_device_list(
n_layers, primary=breakmodel.primary_device, selected=i
)
print(
f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
)
while True:
layerselect = input("# of layers> ")
if (
layerselect.isnumeric() or layerselect.strip() == "-1"
) and -1 <= int(layerselect) <= n_layers:
layerselect = int(layerselect)
layerselect = n_layers if layerselect == -1 else layerselect
breakmodel.gpu_blocks.append(layerselect)
n_layers -= layerselect
break
else:
print(
f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
)
if n_layers == 0:
break
if n_layers > 0:
self.breakmodel_device_list(
n_layers, primary=breakmodel.primary_device, selected=-1
)
print(
f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
)
while True:
layerselect = input("# of layers> ")
if (
layerselect.isnumeric() or layerselect.strip() == "-1"
) and -1 <= int(layerselect) <= n_layers:
layerselect = int(layerselect)
layerselect = n_layers if layerselect == -1 else layerselect
breakmodel.disk_blocks = layerselect
n_layers -= layerselect
break
else:
print(
f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
)
logger.init_ok("Final device configuration:", status="Info")
self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)
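
Most of what this hunk removes is the interactive breakmodel setup, where a comma-separated layer spec (from the console prompts or --breakmodel_gpulayers) was expanded with -1 meaning "take all remaining layers". For readers following what the removed block did, a condensed sketch of that expansion rule (the helper name is mine, not from the code):

def parse_gpu_blocks(spec, n_layers):
    """Turn a string like "20,-1" into per-GPU layer counts; -1 takes the rest."""
    blocks = list(map(int, spec.split(",")))
    remaining = n_layers
    for i, count in enumerate(blocks):
        if count <= -1:
            blocks[i] = remaining
            break
        remaining -= count
    assert sum(blocks) <= n_layers
    return blocks

# parse_gpu_blocks("20,-1", 32) -> [20, 12]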

View File

@@ -38,6 +38,11 @@ class model_backend(InferenceModel):
return model_name == "CLUSTER" or model_name in [x['value'] for x in self.models]
def get_requested_parameters(self, model_name, model_path, menu_path):
if os.path.exists("settings/api.model_backend.settings") and 'base_url' not in vars(self):
with open("settings/horde.model_backend.settings", "r") as f:
temp = json.load(f)
self.base_url = temp['url']
self.key = temp['key']
requested_parameters = []
requested_parameters.extend([{
"uitype": "text",
@@ -122,6 +127,10 @@ class model_backend(InferenceModel):
#else "gpt2",
)
def _save_settings(self):
with open("settings/horde.model_backend.settings", "w") as f:
json.dump({"key": self.key, "url": self.url}, f, indent="")
def _raw_generate(
self,
prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -11,13 +11,14 @@ from modeling.inference_model import (
InferenceModel,
)
from modeling.inference_models.parents.openai_gooseai import model_backend as openai_gooseai_model_backend
from modeling.inference_models.openai_gooseai import model_backend as openai_gooseai_model_backend
model_backend_name = "OpenAI"
class OpenAIAPIError(Exception):
def __init__(self, error_type: str, error_message) -> None:
super().__init__(f"{error_type}: {error_message}")
self.source = "OpenAI"
class model_backend(openai_gooseai_model_backend):

View File

@@ -25,15 +25,14 @@ class model_backend(InferenceModel):
super().__init__()
self.key = ""
self.url = "https://api.goose.ai/v1/engines"
#if self.source == 'OAI':
# url = "https://api.openai.com/v1/engines"
#elif self.source == 'GooseAI':
# url = "https://api.goose.ai/v1/engines"
def is_valid(self, model_name, model_path, menu_path):
return model_name == "OAI" or model_name == "GooseAI"
def get_requested_parameters(self, model_name, model_path, menu_path):
if os.path.exists("settings/{}.model_backend.settings".format(self.source)) and 'colaburl' not in vars(self):
with open("settings/{}.model_backend.settings".format(self.source), "r") as f:
self.key = json.load(f)['key']
self.source = model_name
requested_parameters = []
requested_parameters.extend([{
@@ -41,7 +40,7 @@ class model_backend(InferenceModel):
"unit": "text",
"label": "Key",
"id": "key",
"default": "",
"default": self.key,
"check": {"value": "", 'check': "!="},
"tooltip": "User Key to use when connecting to OpenAI/GooseAI.",
"menu_path": "",
@@ -106,6 +105,10 @@ class model_backend(InferenceModel):
def _load(self, save_model: bool, initial_load: bool) -> None:
self.tokenizer = self._get_tokenizer("gpt2")
def _save_settings(self):
with open("settings/{}.model_backend.settings".format(self.source), "w") as f:
json.dump({"key": self.key}, f, indent="")
def _raw_generate(
self,
prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -1,258 +0,0 @@
from __future__ import annotations
import os
import time
from typing import Dict, List, Optional, Union
import numpy as np
import requests
from tokenizers import Tokenizer
from tqdm import tqdm
from huggingface_hub import hf_hub_url
import torch
from torch.nn import functional as F
# Must be defined before import
os.environ["RWKV_JIT_ON"] = "1"
# TODO: Include compiled kernel
os.environ["RWKV_CUDA_ON"] = "1"
import utils
from logger import logger
from modeling import warpers
from modeling.warpers import Warper
from modeling.stoppers import Stoppers
from modeling.post_token_hooks import PostTokenHooks
from modeling.tokenizer import GenericTokenizer
from modeling.inference_model import (
GenerationResult,
GenerationSettings,
InferenceModel,
ModelCapabilities,
)
TOKENIZER_URL = (
"https://raw.githubusercontent.com/BlinkDL/ChatRWKV/main/20B_tokenizer.json"
)
TOKENIZER_PATH = "models/rwkv/20b_tokenizer.json"
REPO_OWNER = "BlinkDL"
MODEL_FILES = {
"rwkv-4-pile-14b": "RWKV-4-Pile-14B-20230213-8019.pth",
# NOTE: Still in progress(?)
"rwkv-4-pile-14b:ctx4096": "RWKV-4-Pile-14B-20230228-ctx4096-test663.pth",
"rwkv-4-pile-7b": "RWKV-4-Pile-7B-20221115-8047.pth",
"rwkv-4-pile-7b:ctx4096": "RWKV-4-Pile-7B-20230109-ctx4096.pth",
"rwkv-4-pile-3b": "RWKV-4-Pile-3B-20221008-8023.pth",
"rwkv-4-pile-3b:ctx4096": "RWKV-4-Pile-3B-20221110-ctx4096.pth",
"rwkv-4-pile-1b5": "RWKV-4-Pile-1B5-20220903-8040.pth",
"rwkv-4-pile-1b5:ctx4096": "RWKV-4-Pile-1B5-20220929-ctx4096.pth",
"rwkv-4-pile-430m": "RWKV-4-Pile-430M-20220808-8066.pth",
"rwkv-4-pile-169m": "RWKV-4-Pile-169M-20220807-8023.pth",
}
model_backend_name = "RWKV"
class model_backend(InferenceModel):
def __init__(
self,
#model_name: str,
) -> None:
super().__init__()
#self.model_name = model_name
self.post_token_hooks = [
PostTokenHooks.stream_tokens,
]
self.stopper_hooks = [
Stoppers.core_stopper,
Stoppers.dynamic_wi_scanner,
Stoppers.singleline_stopper,
Stoppers.chat_mode_stopper,
Stoppers.stop_sequence_stopper,
]
self.capabilties = ModelCapabilities(
embedding_manipulation=False,
post_token_hooks=True,
stopper_hooks=True,
post_token_probs=True,
)
self._old_stopping_criteria = None
def is_valid(self, model_name, model_path, menu_path):
try:
from rwkv.model import RWKV
valid = True
except:
valid = False
return valid and "rwkv" in model_name.lower()
def get_requested_parameters(self, model_name, model_path, menu_path):
self.source = model_name
requested_parameters = []
return requested_parameters
def set_input_parameters(self):
return
def _ensure_directory_structure(self) -> None:
for path in ["models/rwkv", "models/rwkv/models"]:
try:
os.mkdir(path)
except FileExistsError:
pass
def _get_tokenizer(self) -> GenericTokenizer:
if not os.path.exists(TOKENIZER_PATH):
logger.info("RWKV tokenizer not found, downloading...")
r = requests.get(TOKENIZER_URL)
with open(TOKENIZER_PATH, "wb") as file:
file.write(r.content)
return GenericTokenizer(Tokenizer.from_file(TOKENIZER_PATH))
def _download_model(self, model_path: str, model_class: str) -> None:
logger.info(f"{self.model_name} not found, downloading...")
url = hf_hub_url(
repo_id=f"{REPO_OWNER}/{model_class}",
filename=MODEL_FILES[self.model_name],
)
# TODO: Use aria2
# https://stackoverflow.com/a/57030446
with requests.get(url, stream=True) as r:
r.raise_for_status()
bar = tqdm(
desc="Downloading RWKV Model",
unit="B",
unit_scale=True,
total=int(r.headers["Content-Length"]),
)
with open(model_path, "wb") as file:
for chunk in r.iter_content(chunk_size=8192):
if not chunk:
continue
file.write(chunk)
bar.update(len(chunk))
def _load(self, save_model: bool, initial_load: bool) -> None:
self._ensure_directory_structure()
self.tokenizer = self._get_tokenizer()
# Parse model name
model_class, _, special = self.model_name.partition(":")
special = special or None
model_dir = os.path.join("models", "rwkv", "models", model_class)
if not os.path.exists(model_dir):
os.mkdir(model_dir)
# Download model if we need to
model_path = os.path.join(model_dir, MODEL_FILES[self.model_name])
if not os.path.exists(model_path):
self._download_model(model_path, model_class)
# Now we load!
# TODO: Breakmodel to strat
from rwkv.model import RWKV
self.model = RWKV(model=model_path, strategy="cuda:0 fp16")
def _apply_warpers(
self, scores: torch.Tensor, input_ids: torch.Tensor
) -> torch.Tensor:
warpers.update_settings()
for sid in utils.koboldai_vars.sampler_order:
warper = Warper.from_id(sid)
if not warper.value_is_valid():
continue
if warper == warpers.RepetitionPenalty:
# Rep pen needs more data than other samplers
scores = warper.torch(scores, input_ids=input_ids)
else:
scores = warper.torch(scores)
return scores
def _sample_token(self, logits: torch.Tensor, input_ids: torch.Tensor) -> int:
probs = F.softmax(logits.float(), dim=-1)
if probs.device == torch.device("cpu"):
probs = probs.numpy()
sorted_ids = np.argsort(probs)
sorted_probs = probs[sorted_ids][::-1]
probs = self._apply_warpers(probs[None, :], input_ids)
# TODO: is this right?
probs[probs == -torch.inf] = 0.0
probs = probs / np.sum(probs)
out = np.random.choice(a=len(probs), p=probs)
return int(out)
else:
sorted_ids = torch.argsort(probs)
sorted_probs = probs[sorted_ids]
sorted_probs = torch.flip(sorted_probs, dims=(0,))
probs = self._apply_warpers(probs[None, :], input_ids)
# TODO: is this right?
probs[probs == -torch.inf] = 0.0
out = torch.multinomial(probs, num_samples=1)[0]
return int(out)
def _raw_generate(
self,
prompt_tokens: Union[List[int], torch.Tensor],
max_new: int,
gen_settings: GenerationSettings,
single_line: bool = False,
batch_count: int = 1,
seed: Optional[int] = None,
**kwargs,
) -> GenerationResult:
if seed is not None:
torch.manual_seed(seed)
aux_device = utils.get_auxilary_device()
context = torch.tensor(prompt_tokens)[None, :].to(aux_device)
out = []
start_time = time.time()
with torch.no_grad():
logits, state = self.model.forward(prompt_tokens, None)
last_token = prompt_tokens[-1]
for _ in range(max_new):
logits, state = self.model.forward([last_token], state)
last_token = self._sample_token(logits, context)
out.append(last_token)
add = torch.tensor([[last_token]]).to(aux_device)
context = torch.cat((context, add), dim=-1)
self._post_token_gen(context)
logger.debug(
"torch_raw_generate: run generator {}s".format(time.time() - start_time)
)
return GenerationResult(
self,
out_batches=torch.tensor([out]),
prompt=prompt_tokens,
is_whole_generation=False,
output_includes_prompt=True,
)