Moved model backends to separate folders

added some model backend settings save/load
ebolam
2023-05-18 20:14:33 -04:00
parent 4040538d34
commit 06f59a7b7b
14 changed files with 69 additions and 409 deletions
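In short, each inference backend now lives in its own folder under modeling/inference_models/ with a class.py module that exposes model_backend_name and a model_backend class, and a backend can persist its UI parameters as a small JSON file under settings/. A minimal sketch of that save/load pattern, with an illustrative ExampleBackend class and settings filename that are not part of the commit:

import json
import os


class ExampleBackend:
    # Illustrative stand-in for a model_backend class that would live in
    # modeling/inference_models/<folder>/class.py.
    settings_path = "settings/example.model_backend.settings"

    def __init__(self) -> None:
        self.base_url = ""

    def load_settings(self) -> None:
        # Re-read previously saved parameters so the UI can show them as defaults.
        if os.path.exists(self.settings_path):
            with open(self.settings_path, "r") as f:
                self.base_url = json.load(f).get("base_url", "")

    def save_settings(self) -> None:
        # Same idea as the _save_settings() hook this commit adds to the backends.
        with open(self.settings_path, "w") as f:
            json.dump({"base_url": self.base_url}, f, indent="")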

View File

@@ -626,9 +626,9 @@ import importlib
 model_backend_code = {}
 model_backends = {}
 for module in os.listdir("./modeling/inference_models"):
-    if os.path.isfile(os.path.join("./modeling/inference_models",module)) and module[-3:] == '.py':
-        model_backend_code[module[:-3]] = importlib.import_module('modeling.inference_models.{}'.format(module[:-3]))
-        model_backends[model_backend_code[module[:-3]].model_backend_name] = model_backend_code[module[:-3]].model_backend()
+    if not os.path.isfile(os.path.join("./modeling/inference_models",module)) and module != '__pycache__':
+        model_backend_code[module] = importlib.import_module('modeling.inference_models.{}.class'.format(module))
+        model_backends[model_backend_code[module].model_backend_name] = model_backend_code[module].model_backend()
 old_socketio_on = socketio.on
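For context, the new loop treats every subfolder of modeling/inference_models/ (except __pycache__) as a backend and imports its class module, instead of importing loose .py files. A self-contained sketch of the same pattern, where the function name and the default root path are just for illustration:

import importlib
import os


def discover_backends(root: str = "./modeling/inference_models") -> dict:
    # Import modeling.inference_models.<folder>.class for every backend folder
    # and map each backend's display name to an instance of its model_backend class.
    backends = {}
    for module in os.listdir(root):
        if not os.path.isfile(os.path.join(root, module)) and module != "__pycache__":
            code = importlib.import_module(
                "modeling.inference_models.{}.class".format(module)
            )
            backends[code.model_backend_name] = code.model_backend()
    return backends

Backends are now keyed by folder name rather than by a filename with its .py extension stripped.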

View File

@@ -188,6 +188,7 @@ class InferenceModel:
         self._pre_load()
         self._load(save_model=save_model, initial_load=initial_load)
         self._post_load()
+        self._save_settings()
 
     def unload(self):
         return
@@ -198,6 +199,9 @@ class InferenceModel:
     def _post_load(self) -> None:
         """Post load hook. Called after `_load()`."""
 
+    def _save_settings(self) -> None:
+        """Save settings hook. Called after `_post_load()`."""
+
     def _load(self, save_model: bool, initial_load: bool) -> None:
         """Main load method. All logic related to loading the model onto the
         selected device(s) and preparing it for inference should be implemented here."""
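A condensed sketch of the loading sequence implied by the hunk above; the enclosing method name (load) and its default arguments are assumptions here, since the diff only shows the hook calls:

class InferenceModel:
    def load(self, save_model: bool = False, initial_load: bool = False) -> None:
        # Hooks run in this order, ending with the new settings-save step.
        self._pre_load()
        self._load(save_model=save_model, initial_load=initial_load)
        self._post_load()
        self._save_settings()

    def _pre_load(self) -> None:
        """Pre load hook. Called before `_load()`."""

    def _post_load(self) -> None:
        """Post load hook. Called after `_load()`."""

    def _save_settings(self) -> None:
        """Save settings hook. Called after `_post_load()`."""

    def _load(self, save_model: bool, initial_load: bool) -> None:
        """Main load method, implemented by each backend."""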

View File

@@ -26,19 +26,22 @@ class APIException(Exception):
 class model_backend(InferenceModel):
     def __init__(self) -> None:
         super().__init__()
-        #self.base_url = ""
+        self.base_url = ""
 
     def is_valid(self, model_name, model_path, menu_path):
         return model_name == "API"
 
     def get_requested_parameters(self, model_name, model_path, menu_path):
+        if os.path.exists("settings/api.model_backend.settings") and 'base_url' not in vars(self):
+            with open("settings/api.model_backend.settings", "r") as f:
+                self.base_url = json.load(f)['base_url']
         requested_parameters = []
         requested_parameters.append({
             "uitype": "text",
             "unit": "text",
             "label": "URL",
             "id": "base_url",
-            "default": False,
+            "default": self.base_url,
             "check": {"value": "", 'check': "!="},
             "tooltip": "The URL of the KoboldAI API to connect to.",
             "menu_path": "",
@@ -58,6 +61,10 @@ class model_backend(InferenceModel):
         # Do not allow API to be served over the API
         self.capabilties = ModelCapabilities(api_host=False)
 
+    def _save_settings(self):
+        with open("settings/api.model_backend.settings", "w") as f:
+            json.dump({"base_url": self.base_url}, f, indent="")
+
     def _raw_generate(
         self,
         prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -24,6 +24,7 @@ class BasicAPIException(Exception):
 class model_backend(InferenceModel):
     def __init__(self) -> None:
         super().__init__()
+        self.colaburl = ""
 
         # Do not allow API to be served over the API
         self.capabilties = ModelCapabilities(api_host=False)
@@ -32,13 +33,16 @@ class model_backend(InferenceModel):
         return model_name == "Colab"
 
     def get_requested_parameters(self, model_name, model_path, menu_path):
+        if os.path.exists("settings/api.model_backend.settings") and 'colaburl' not in vars(self):
+            with open("settings/api.model_backend.settings", "r") as f:
+                self.colaburl = json.load(f)['base_url']
         requested_parameters = []
         requested_parameters.append({
             "uitype": "text",
             "unit": "text",
             "label": "URL",
             "id": "colaburl",
-            "default": False,
+            "default": self.colaburl,
             "check": {"value": "", 'check': "!="},
             "tooltip": "The URL of the Colab KoboldAI API to connect to.",
             "menu_path": "",
@@ -56,6 +60,10 @@ class model_backend(InferenceModel):
     def _load(self, save_model: bool, initial_load: bool) -> None:
         self.tokenizer = self._get_tokenizer("EleutherAI/gpt-neo-2.7B")
 
+    def _save_settings(self):
+        with open("settings/basic_api.model_backend.settings", "w") as f:
+            json.dump({"colaburl": self.colaburl}, f, indent="")
+
     def _raw_generate(
         self,
         prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -20,7 +20,7 @@ except ModuleNotFoundError as e:
     if not utils.koboldai_vars.use_colab_tpu:
         raise e
 
-from modeling.inference_models.parents.hf_torch import HFTorchInferenceModel
+from modeling.inference_models.hf_torch import HFTorchInferenceModel
 
 model_backend_name = "Huggingface"
@@ -270,3 +270,7 @@ class model_backend(HFTorchInferenceModel):
         self.model.kai_model = self
         utils.koboldai_vars.modeldim = self.get_hidden_size()
 
+    def _save_settings(self):
+        with open("settings/{}.generic_hf_torch.model_backend.settings".format(self.model_name.replace("/", "_")), "w") as f:
+            json.dump({"layers": self.layers if 'layers' in vars(self) else [], "disk_layers": self.disk_layers if 'disk_layers' in vars(self) else 0}, f, indent="")
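For a sense of what this writes, a hypothetical example of the JSON produced by _save_settings() for a model split across two GPUs with no disk cache; the file name and the values are invented for illustration:

# e.g. settings/EleutherAI_gpt-neo-2.7B.generic_hf_torch.model_backend.settings
example_settings = {
    "layers": [16, 16],  # breakmodel layer counts for GPU 0 and GPU 1
    "disk_layers": 0,    # layers kept in the disk cache
}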

View File

@@ -11,14 +11,14 @@ from modeling.inference_model import (
     InferenceModel,
 )
-from modeling.inference_models.parents.openai_gooseai import model_backend as openai_gooseai_model_backend
+from modeling.inference_models.openai_gooseai import model_backend as openai_gooseai_model_backend
 
 model_backend_name = "GooseAI"
 
 class OpenAIAPIError(Exception):
     def __init__(self, error_type: str, error_message) -> None:
         super().__init__(f"{error_type}: {error_message}")
+        self.source = "GooseAI"
 
 class model_backend(openai_gooseai_model_backend):

View File

@@ -3,6 +3,7 @@ from typing import Optional
 from transformers import AutoConfig
 import warnings
 import utils
+import json
 import koboldai_settings
 from logger import logger
 from modeling.inference_model import InferenceModel
@@ -44,16 +45,15 @@ class HFInferenceModel(InferenceModel):
         self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
         layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
         if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
-            if os.path.exists("settings/{}.breakmodel".format(model_name.replace("/", "_"))):
-                with open("settings/{}.breakmodel".format(model_name.replace("/", "_")), "r") as file:
-                    data = [x for x in file.read().split("\n")[:2] if x != '']
-                    if len(data) < 2:
-                        data.append("0")
-                    break_values, disk_blocks = data
-                    break_values = break_values.split(",")
+            if os.path.exists("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
+                with open("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
+                    temp = json.load(f)
+                    break_values = temp['layers'] if 'layers' in temp else [layer_count]
+                    disk_blocks = temp['disk_layers'] if 'disk_layers' in temp else 0
             else:
                 break_values = [layer_count]
                 disk_blocks = 0
             break_values = [int(x) for x in break_values if x != '' and x is not None]
             gpu_count = torch.cuda.device_count()
             break_values += [0] * (gpu_count - len(break_values))
@@ -132,8 +132,15 @@ class HFInferenceModel(InferenceModel):
         if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
             gpu_count = torch.cuda.device_count()
             layers = []
+            logger.info(parameters)
             for i in range(gpu_count):
-                layers.append(int(parameters["{}_Layers".format(i)]) if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric() else None)
+                logger.info(parameters["{}_Layers".format(i)])
+                if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric():
+                    layers.append(int(parameters["{}_Layers".format(i)]))
+                elif isinstance(parameters["{}_Layers".format(i)], str):
+                    layers.append(None)
+                else:
+                    layers.append(parameters["{}_Layers".format(i)])
             self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None
             if isinstance(self.cpu_layers, str):
                 self.cpu_layers = int(self.cpu_layers) if self.cpu_layers.isnumeric() else 0
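Because these saved values feed the breakmodel layer split, a small illustrative helper (not part of the codebase) shows how the JSON settings map onto a per-GPU layer list:

import json


def load_break_values(path: str, layer_count: int, gpu_count: int):
    # Read saved per-GPU layer counts and the disk-cache layer count,
    # fall back to putting every layer on one device, then pad the list
    # so there is one entry per visible GPU.
    try:
        with open(path, "r") as f:
            temp = json.load(f)
        break_values = temp["layers"] if "layers" in temp else [layer_count]
        disk_blocks = temp["disk_layers"] if "disk_layers" in temp else 0
    except FileNotFoundError:
        break_values, disk_blocks = [layer_count], 0

    break_values = [int(x) for x in break_values if x != "" and x is not None]
    break_values += [0] * (gpu_count - len(break_values))
    return break_values, disk_blocks

With no settings file present, load_break_values("missing.json", 32, 2) returns ([32, 0], 0), i.e. every layer on the first GPU and nothing in the disk cache.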

View File

@@ -16,7 +16,7 @@ from modeling.inference_model import (
     GenerationSettings,
     ModelCapabilities,
 )
-from modeling.inference_models.parents.hf import HFInferenceModel
+from modeling.inference_models.hf import HFInferenceModel
 from modeling.tokenizer import GenericTokenizer
 
 model_backend_name = "Huggingface MTJ"

View File

@@ -31,7 +31,7 @@ from modeling import warpers
 from modeling.warpers import Warper
 from modeling.stoppers import Stoppers
 from modeling.post_token_hooks import PostTokenHooks
-from modeling.inference_models.parents.hf import HFInferenceModel
+from modeling.inference_models.hf import HFInferenceModel
 from modeling.inference_model import (
     GenerationResult,
     GenerationSettings,
@@ -823,135 +823,10 @@ class HFTorchInferenceModel(HFInferenceModel):
             breakmodel.gpu_blocks = [0] * n_layers
             return
-        elif (
-            utils.args.breakmodel_gpulayers is not None
-            or utils.args.breakmodel_disklayers is not None
-            or breakmodel.gpu_blocks != []
-        ):
-            try:
-                if breakmodel.gpu_blocks == []:
-                    if utils.args.breakmodel_gpulayers:
-                        breakmodel.gpu_blocks = list(
-                            map(int, utils.args.breakmodel_gpulayers.split(","))
-                        )
-                assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count()
-                s = n_layers
-                for i in range(len(breakmodel.gpu_blocks)):
-                    if breakmodel.gpu_blocks[i] <= -1:
-                        breakmodel.gpu_blocks[i] = s
-                        break
-                    else:
-                        s -= breakmodel.gpu_blocks[i]
-                assert sum(breakmodel.gpu_blocks) <= n_layers
-                n_layers -= sum(breakmodel.gpu_blocks)
-                n_layers -= breakmodel.disk_blocks
-            except:
-                logger.warning(
-                    "--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0."
-                )
-                breakmodel.gpu_blocks = [n_layers]
-                n_layers = 0
-        elif utils.args.breakmodel_layers is not None:
-            breakmodel.gpu_blocks = [
-                n_layers - max(0, min(n_layers, utils.args.breakmodel_layers))
-            ]
-            n_layers -= sum(breakmodel.gpu_blocks)
-        elif utils.args.model is not None:
+        elif breakmodel.gpu_blocks != []:
             logger.info("Breakmodel not specified, assuming GPU 0")
             breakmodel.gpu_blocks = [n_layers]
             n_layers = 0
-        else:
-            device_count = torch.cuda.device_count()
-            if device_count > 1:
-                print(
-                    Colors.CYAN
-                    + "\nPlease select one of your GPUs to be your primary GPU."
-                )
-                print(
-                    "VRAM usage in your primary GPU will be higher than for your other ones."
-                )
-                print("It is recommended you make your fastest GPU your primary GPU.")
-                self.breakmodel_device_list(n_layers)
-                while True:
-                    primaryselect = input("device ID> ")
-                    if (
-                        primaryselect.isnumeric()
-                        and 0 <= int(primaryselect) < device_count
-                    ):
-                        breakmodel.primary_device = int(primaryselect)
-                        break
-                    else:
-                        print(
-                            f"{Colors.RED}Please enter an integer between 0 and {device_count-1}.{Colors.END}"
-                        )
-            else:
-                breakmodel.primary_device = 0
-            print(
-                Colors.PURPLE
-                + "\nIf you don't have enough VRAM to run the model on a single GPU"
-            )
-            print(
-                "you can split the model between your CPU and your GPU(s), or between"
-            )
-            print("multiple GPUs if you have more than one.")
-            print("By putting more 'layers' on a GPU or CPU, more computations will be")
-            print(
-                "done on that device and more VRAM or RAM will be required on that device"
-            )
-            print("(roughly proportional to number of layers).")
-            print(
-                "It should be noted that GPUs are orders of magnitude faster than the CPU."
-            )
-            print(
-                f"This model has{Colors.YELLOW} {n_layers} {Colors.PURPLE}layers.{Colors.END}\n"
-            )
-            for i in range(device_count):
-                self.breakmodel_device_list(
-                    n_layers, primary=breakmodel.primary_device, selected=i
-                )
-                print(
-                    f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
-                )
-                while True:
-                    layerselect = input("# of layers> ")
-                    if (
-                        layerselect.isnumeric() or layerselect.strip() == "-1"
-                    ) and -1 <= int(layerselect) <= n_layers:
-                        layerselect = int(layerselect)
-                        layerselect = n_layers if layerselect == -1 else layerselect
-                        breakmodel.gpu_blocks.append(layerselect)
-                        n_layers -= layerselect
-                        break
-                    else:
-                        print(
-                            f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
-                        )
-                if n_layers == 0:
-                    break
-            if n_layers > 0:
-                self.breakmodel_device_list(
-                    n_layers, primary=breakmodel.primary_device, selected=-1
-                )
-                print(
-                    f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
-                )
-                while True:
-                    layerselect = input("# of layers> ")
-                    if (
-                        layerselect.isnumeric() or layerselect.strip() == "-1"
-                    ) and -1 <= int(layerselect) <= n_layers:
-                        layerselect = int(layerselect)
-                        layerselect = n_layers if layerselect == -1 else layerselect
-                        breakmodel.disk_blocks = layerselect
-                        n_layers -= layerselect
-                        break
-                    else:
-                        print(
-                            f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
-                        )
         logger.init_ok("Final device configuration:", status="Info")
         self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)

View File

@@ -38,6 +38,11 @@ class model_backend(InferenceModel):
         return model_name == "CLUSTER" or model_name in [x['value'] for x in self.models]
 
     def get_requested_parameters(self, model_name, model_path, menu_path):
+        if os.path.exists("settings/api.model_backend.settings") and 'base_url' not in vars(self):
+            with open("settings/horde.model_backend.settings", "r") as f:
+                temp = json.load(f)
+                self.base_url = temp['url']
+                self.key = temp['key']
         requested_parameters = []
         requested_parameters.extend([{
             "uitype": "text",
@@ -122,6 +127,10 @@ class model_backend(InferenceModel):
             #else "gpt2",
         )
 
+    def _save_settings(self):
+        with open("settings/horde.model_backend.settings", "w") as f:
+            json.dump({"key": self.key, "url": self.url}, f, indent="")
+
     def _raw_generate(
         self,
         prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -11,13 +11,14 @@ from modeling.inference_model import (
     InferenceModel,
 )
-from modeling.inference_models.parents.openai_gooseai import model_backend as openai_gooseai_model_backend
+from modeling.inference_models.openai_gooseai import model_backend as openai_gooseai_model_backend
 
 model_backend_name = "OpenAI"
 
 class OpenAIAPIError(Exception):
     def __init__(self, error_type: str, error_message) -> None:
         super().__init__(f"{error_type}: {error_message}")
+        self.source = "OpenAI"
 
 class model_backend(openai_gooseai_model_backend):

View File

@@ -25,15 +25,14 @@ class model_backend(InferenceModel):
         super().__init__()
         self.key = ""
         self.url = "https://api.goose.ai/v1/engines"
-        #if self.source == 'OAI':
-        #    url = "https://api.openai.com/v1/engines"
-        #elif self.source == 'GooseAI':
-        #    url = "https://api.goose.ai/v1/engines"
 
     def is_valid(self, model_name, model_path, menu_path):
         return model_name == "OAI" or model_name == "GooseAI"
 
     def get_requested_parameters(self, model_name, model_path, menu_path):
+        if os.path.exists("settings/{}.model_backend.settings".format(self.source)) and 'colaburl' not in vars(self):
+            with open("settings/{}.model_backend.settings".format(self.source), "r") as f:
+                self.key = json.load(f)['key']
         self.source = model_name
         requested_parameters = []
         requested_parameters.extend([{
@@ -41,7 +40,7 @@ class model_backend(InferenceModel):
"unit": "text", "unit": "text",
"label": "Key", "label": "Key",
"id": "key", "id": "key",
"default": "", "default": self.key,
"check": {"value": "", 'check': "!="}, "check": {"value": "", 'check': "!="},
"tooltip": "User Key to use when connecting to OpenAI/GooseAI.", "tooltip": "User Key to use when connecting to OpenAI/GooseAI.",
"menu_path": "", "menu_path": "",
@@ -106,6 +105,10 @@ class model_backend(InferenceModel):
     def _load(self, save_model: bool, initial_load: bool) -> None:
         self.tokenizer = self._get_tokenizer("gpt2")
 
+    def _save_settings(self):
+        with open("settings/{}.model_backend.settings".format(self.source), "w") as f:
+            json.dump({"key": self.key}, f, indent="")
+
     def _raw_generate(
         self,
         prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -1,258 +0,0 @@
-from __future__ import annotations
-
-import os
-import time
-from typing import Dict, List, Optional, Union
-
-import numpy as np
-import requests
-from tokenizers import Tokenizer
-from tqdm import tqdm
-from huggingface_hub import hf_hub_url
-
-import torch
-from torch.nn import functional as F
-
-# Must be defined before import
-os.environ["RWKV_JIT_ON"] = "1"
-# TODO: Include compiled kernel
-os.environ["RWKV_CUDA_ON"] = "1"
-
-import utils
-from logger import logger
-
-from modeling import warpers
-from modeling.warpers import Warper
-from modeling.stoppers import Stoppers
-from modeling.post_token_hooks import PostTokenHooks
-from modeling.tokenizer import GenericTokenizer
-from modeling.inference_model import (
-    GenerationResult,
-    GenerationSettings,
-    InferenceModel,
-    ModelCapabilities,
-)
-
-TOKENIZER_URL = (
-    "https://raw.githubusercontent.com/BlinkDL/ChatRWKV/main/20B_tokenizer.json"
-)
-TOKENIZER_PATH = "models/rwkv/20b_tokenizer.json"
-REPO_OWNER = "BlinkDL"
-MODEL_FILES = {
-    "rwkv-4-pile-14b": "RWKV-4-Pile-14B-20230213-8019.pth",
-    # NOTE: Still in progress(?)
-    "rwkv-4-pile-14b:ctx4096": "RWKV-4-Pile-14B-20230228-ctx4096-test663.pth",
-    "rwkv-4-pile-7b": "RWKV-4-Pile-7B-20221115-8047.pth",
-    "rwkv-4-pile-7b:ctx4096": "RWKV-4-Pile-7B-20230109-ctx4096.pth",
-    "rwkv-4-pile-3b": "RWKV-4-Pile-3B-20221008-8023.pth",
-    "rwkv-4-pile-3b:ctx4096": "RWKV-4-Pile-3B-20221110-ctx4096.pth",
-    "rwkv-4-pile-1b5": "RWKV-4-Pile-1B5-20220903-8040.pth",
-    "rwkv-4-pile-1b5:ctx4096": "RWKV-4-Pile-1B5-20220929-ctx4096.pth",
-    "rwkv-4-pile-430m": "RWKV-4-Pile-430M-20220808-8066.pth",
-    "rwkv-4-pile-169m": "RWKV-4-Pile-169M-20220807-8023.pth",
-}
-
-model_backend_name = "RWKV"
-
-
-class model_backend(InferenceModel):
-    def __init__(
-        self,
-        #model_name: str,
-    ) -> None:
-        super().__init__()
-        #self.model_name = model_name
-
-        self.post_token_hooks = [
-            PostTokenHooks.stream_tokens,
-        ]
-
-        self.stopper_hooks = [
-            Stoppers.core_stopper,
-            Stoppers.dynamic_wi_scanner,
-            Stoppers.singleline_stopper,
-            Stoppers.chat_mode_stopper,
-            Stoppers.stop_sequence_stopper,
-        ]
-
-        self.capabilties = ModelCapabilities(
-            embedding_manipulation=False,
-            post_token_hooks=True,
-            stopper_hooks=True,
-            post_token_probs=True,
-        )
-        self._old_stopping_criteria = None
-
-    def is_valid(self, model_name, model_path, menu_path):
-        try:
-            from rwkv.model import RWKV
-            valid = True
-        except:
-            valid = False
-        return valid and "rwkv" in model_name.lower()
-
-    def get_requested_parameters(self, model_name, model_path, menu_path):
-        self.source = model_name
-        requested_parameters = []
-        return requested_parameters
-
-    def set_input_parameters(self):
-        return
-
-    def _ensure_directory_structure(self) -> None:
-        for path in ["models/rwkv", "models/rwkv/models"]:
-            try:
-                os.mkdir(path)
-            except FileExistsError:
-                pass
-
-    def _get_tokenizer(self) -> GenericTokenizer:
-        if not os.path.exists(TOKENIZER_PATH):
-            logger.info("RWKV tokenizer not found, downloading...")
-
-            r = requests.get(TOKENIZER_URL)
-            with open(TOKENIZER_PATH, "wb") as file:
-                file.write(r.content)
-
-        return GenericTokenizer(Tokenizer.from_file(TOKENIZER_PATH))
-
-    def _download_model(self, model_path: str, model_class: str) -> None:
-        logger.info(f"{self.model_name} not found, downloading...")
-        url = hf_hub_url(
-            repo_id=f"{REPO_OWNER}/{model_class}",
-            filename=MODEL_FILES[self.model_name],
-        )
-
-        # TODO: Use aria2
-        # https://stackoverflow.com/a/57030446
-        with requests.get(url, stream=True) as r:
-            r.raise_for_status()
-            bar = tqdm(
-                desc="Downloading RWKV Model",
-                unit="B",
-                unit_scale=True,
-                total=int(r.headers["Content-Length"]),
-            )
-            with open(model_path, "wb") as file:
-                for chunk in r.iter_content(chunk_size=8192):
-                    if not chunk:
-                        continue
-
-                    file.write(chunk)
-                    bar.update(len(chunk))
-
-    def _load(self, save_model: bool, initial_load: bool) -> None:
-        self._ensure_directory_structure()
-        self.tokenizer = self._get_tokenizer()
-
-        # Parse model name
-        model_class, _, special = self.model_name.partition(":")
-        special = special or None
-
-        model_dir = os.path.join("models", "rwkv", "models", model_class)
-        if not os.path.exists(model_dir):
-            os.mkdir(model_dir)
-
-        # Download model if we need to
-        model_path = os.path.join(model_dir, MODEL_FILES[self.model_name])
-        if not os.path.exists(model_path):
-            self._download_model(model_path, model_class)
-
-        # Now we load!
-        # TODO: Breakmodel to strat
-        from rwkv.model import RWKV
-        self.model = RWKV(model=model_path, strategy="cuda:0 fp16")
-
-    def _apply_warpers(
-        self, scores: torch.Tensor, input_ids: torch.Tensor
-    ) -> torch.Tensor:
-        warpers.update_settings()
-
-        for sid in utils.koboldai_vars.sampler_order:
-            warper = Warper.from_id(sid)
-
-            if not warper.value_is_valid():
-                continue
-
-            if warper == warpers.RepetitionPenalty:
-                # Rep pen needs more data than other samplers
-                scores = warper.torch(scores, input_ids=input_ids)
-            else:
-                scores = warper.torch(scores)
-        return scores
-
-    def _sample_token(self, logits: torch.Tensor, input_ids: torch.Tensor) -> int:
-        probs = F.softmax(logits.float(), dim=-1)
-
-        if probs.device == torch.device("cpu"):
-            probs = probs.numpy()
-            sorted_ids = np.argsort(probs)
-            sorted_probs = probs[sorted_ids][::-1]
-
-            probs = self._apply_warpers(probs[None, :], input_ids)
-
-            # TODO: is this right?
-            probs[probs == -torch.inf] = 0.0
-
-            probs = probs / np.sum(probs)
-            out = np.random.choice(a=len(probs), p=probs)
-            return int(out)
-        else:
-            sorted_ids = torch.argsort(probs)
-            sorted_probs = probs[sorted_ids]
-            sorted_probs = torch.flip(sorted_probs, dims=(0,))
-
-            probs = self._apply_warpers(probs[None, :], input_ids)
-
-            # TODO: is this right?
-            probs[probs == -torch.inf] = 0.0
-
-            out = torch.multinomial(probs, num_samples=1)[0]
-            return int(out)
-
-    def _raw_generate(
-        self,
-        prompt_tokens: Union[List[int], torch.Tensor],
-        max_new: int,
-        gen_settings: GenerationSettings,
-        single_line: bool = False,
-        batch_count: int = 1,
-        seed: Optional[int] = None,
-        **kwargs,
-    ) -> GenerationResult:
-        if seed is not None:
-            torch.manual_seed(seed)
-
-        aux_device = utils.get_auxilary_device()
-        context = torch.tensor(prompt_tokens)[None, :].to(aux_device)
-        out = []
-
-        start_time = time.time()
-        with torch.no_grad():
-            logits, state = self.model.forward(prompt_tokens, None)
-            last_token = prompt_tokens[-1]
-
-            for _ in range(max_new):
-                logits, state = self.model.forward([last_token], state)
-                last_token = self._sample_token(logits, context)
-                out.append(last_token)
-                add = torch.tensor([[last_token]]).to(aux_device)
-                context = torch.cat((context, add), dim=-1)
-                self._post_token_gen(context)
-
-        logger.debug(
-            "torch_raw_generate: run generator {}s".format(time.time() - start_time)
-        )
-
-        return GenerationResult(
-            self,
-            out_batches=torch.tensor([out]),
-            prompt=prompt_tokens,
-            is_whole_generation=False,
-            output_includes_prompt=True,
-        )