diff --git a/aiserver.py b/aiserver.py
index aeebdbc1..6a87d8d3 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -626,9 +626,9 @@ import importlib
 model_backend_code = {}
 model_backends = {}
 for module in os.listdir("./modeling/inference_models"):
-    if os.path.isfile(os.path.join("./modeling/inference_models",module)) and module[-3:] == '.py':
-        model_backend_code[module[:-3]] = importlib.import_module('modeling.inference_models.{}'.format(module[:-3]))
-        model_backends[model_backend_code[module[:-3]].model_backend_name] = model_backend_code[module[:-3]].model_backend()
+    if not os.path.isfile(os.path.join("./modeling/inference_models",module)) and module != '__pycache__':
+        model_backend_code[module] = importlib.import_module('modeling.inference_models.{}.class'.format(module))
+        model_backends[model_backend_code[module].model_backend_name] = model_backend_code[module].model_backend()

 old_socketio_on = socketio.on
diff --git a/modeling/inference_model.py b/modeling/inference_model.py
index c3fff46f..491d2b05 100644
--- a/modeling/inference_model.py
+++ b/modeling/inference_model.py
@@ -188,6 +188,7 @@ class InferenceModel:
         self._pre_load()
         self._load(save_model=save_model, initial_load=initial_load)
         self._post_load()
+        self._save_settings()

     def unload(self):
         return
@@ -197,6 +198,9 @@ class InferenceModel:

     def _post_load(self) -> None:
         """Post load hook. Called after `_load()`."""
+
+    def _save_settings(self) -> None:
+        """Save settings hook. Called after `_post_load()`."""

     def _load(self, save_model: bool, initial_load: bool) -> None:
         """Main load method. All logic related to loading the model onto the
diff --git a/modeling/inference_models/api.py b/modeling/inference_models/api/class.py
similarity index 89%
rename from modeling/inference_models/api.py
rename to modeling/inference_models/api/class.py
index 409158f5..d9ec1147 100644
--- a/modeling/inference_models/api.py
+++ b/modeling/inference_models/api/class.py
@@ -26,19 +26,22 @@ class APIException(Exception):
 class model_backend(InferenceModel):
     def __init__(self) -> None:
         super().__init__()
-        #self.base_url = ""
+        self.base_url = ""

     def is_valid(self, model_name, model_path, menu_path):
         return model_name == "API"

     def get_requested_parameters(self, model_name, model_path, menu_path):
+        if os.path.exists("settings/api.model_backend.settings") and 'base_url' not in vars(self):
+            with open("settings/api.model_backend.settings", "r") as f:
+                self.base_url = json.load(f)['base_url']
         requested_parameters = []
         requested_parameters.append({
             "uitype": "text",
             "unit": "text",
             "label": "URL",
             "id": "base_url",
-            "default": False,
+            "default": self.base_url,
             "check": {"value": "", 'check': "!="},
             "tooltip": "The URL of the KoboldAI API to connect to.",
             "menu_path": "",
@@ -58,6 +61,10 @@ class model_backend(InferenceModel):
         # Do not allow API to be served over the API
         self.capabilties = ModelCapabilities(api_host=False)

+    def _save_settings(self):
+        with open("settings/api.model_backend.settings", "w") as f:
+            json.dump({"base_url": self.base_url}, f, indent="")
+
     def _raw_generate(
         self,
         prompt_tokens: Union[List[int], torch.Tensor],
diff --git a/modeling/inference_models/basic_api.py b/modeling/inference_models/basic_api/class.py
similarity index 88%
rename from modeling/inference_models/basic_api.py
rename to modeling/inference_models/basic_api/class.py
index cca9652b..6f045ef5 100644
--- a/modeling/inference_models/basic_api.py
+++ b/modeling/inference_models/basic_api/class.py
@@ -24,6 +24,7 @@ class BasicAPIException(Exception):
 class model_backend(InferenceModel):
     def __init__(self) -> None:
         super().__init__()
+        self.colaburl = ""

         # Do not allow API to be served over the API
         self.capabilties = ModelCapabilities(api_host=False)
@@ -32,13 +33,16 @@ class model_backend(InferenceModel):
         return model_name == "Colab"

     def get_requested_parameters(self, model_name, model_path, menu_path):
+        if os.path.exists("settings/api.model_backend.settings") and 'colaburl' not in vars(self):
+            with open("settings/api.model_backend.settings", "r") as f:
+                self.colaburl = json.load(f)['base_url']
         requested_parameters = []
         requested_parameters.append({
             "uitype": "text",
             "unit": "text",
             "label": "URL",
             "id": "colaburl",
-            "default": False,
+            "default": self.colaburl,
             "check": {"value": "", 'check': "!="},
             "tooltip": "The URL of the Colab KoboldAI API to connect to.",
             "menu_path": "",
@@ -55,6 +59,10 @@ class model_backend(InferenceModel):

     def _load(self, save_model: bool, initial_load: bool) -> None:
         self.tokenizer = self._get_tokenizer("EleutherAI/gpt-neo-2.7B")
+
+    def _save_settings(self):
+        with open("settings/basic_api.model_backend.settings", "w") as f:
+            json.dump({"colaburl": self.colaburl}, f, indent="")

     def _raw_generate(
         self,
diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch/class.py
similarity index 97%
rename from modeling/inference_models/generic_hf_torch.py
rename to modeling/inference_models/generic_hf_torch/class.py
index f7a00f45..4e2c8a5b 100644
--- a/modeling/inference_models/generic_hf_torch.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -20,7 +20,7 @@ except ModuleNotFoundError as e:
     if not utils.koboldai_vars.use_colab_tpu:
         raise e

-from modeling.inference_models.parents.hf_torch import HFTorchInferenceModel
+from modeling.inference_models.hf_torch import HFTorchInferenceModel

 model_backend_name = "Huggingface"

@@ -270,3 +270,7 @@ class model_backend(HFTorchInferenceModel):

         self.model.kai_model = self
         utils.koboldai_vars.modeldim = self.get_hidden_size()
+
+    def _save_settings(self):
+        with open("settings/{}.generic_hf_torch.model_backend.settings".format(self.model_name.replace("/", "_")), "w") as f:
+            json.dump({"layers": self.layers if 'layers' in vars(self) else [], "disk_layers": self.disk_layers if 'disk_layers' in vars(self) else 0}, f, indent="")
\ No newline at end of file
diff --git a/modeling/inference_models/gooseai.py b/modeling/inference_models/gooseai/class.py
similarity index 80%
rename from modeling/inference_models/gooseai.py
rename to modeling/inference_models/gooseai/class.py
index 9d6e8771..8d58b4b5 100644
--- a/modeling/inference_models/gooseai.py
+++ b/modeling/inference_models/gooseai/class.py
@@ -11,14 +11,14 @@ from modeling.inference_model import (
     InferenceModel,
 )

-from modeling.inference_models.parents.openai_gooseai import model_backend as openai_gooseai_model_backend
-
+from modeling.inference_models.openai_gooseai import model_backend as openai_gooseai_model_backend

 model_backend_name = "GooseAI"

 class OpenAIAPIError(Exception):
     def __init__(self, error_type: str, error_message) -> None:
         super().__init__(f"{error_type}: {error_message}")
+        self.source = "GooseAI"


 class model_backend(openai_gooseai_model_backend):
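Note on the new backend layout (illustration, not part of the patch): the aiserver.py hunk above discovers one directory per backend under modeling/inference_models and imports its class.py as modeling.inference_models.<dir>.class, and InferenceModel.load() now calls the new _save_settings() hook after _post_load(). Below is a minimal sketch of a backend package that this loop would pick up; the "Example" backend name, the URL field, and the settings filename are hypothetical, and the assumption that get_requested_parameters() returns its parameter list is not something this diff shows.

```python
# modeling/inference_models/example/class.py -- hypothetical backend, for illustration only
import json
import os

from modeling.inference_model import InferenceModel

model_backend_name = "Example"  # read by the discovery loop in aiserver.py right after import


class model_backend(InferenceModel):
    def __init__(self) -> None:
        super().__init__()
        self.base_url = ""

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "Example"

    def get_requested_parameters(self, model_name, model_path, menu_path):
        # Pre-fill the UI field from a previously saved settings file,
        # mirroring the pattern used in api/class.py above.
        path = "settings/example.model_backend.settings"
        if os.path.exists(path) and not self.base_url:
            with open(path, "r") as f:
                self.base_url = json.load(f).get("base_url", "")
        return [{
            "uitype": "text",
            "unit": "text",
            "label": "URL",
            "id": "base_url",
            "default": self.base_url,
            "check": {"value": "", "check": "!="},
            "tooltip": "Example URL field.",
            "menu_path": "",
        }]

    def _save_settings(self):
        # Called by InferenceModel.load() after _post_load(), per the hunk above.
        with open("settings/example.model_backend.settings", "w") as f:
            json.dump({"base_url": self.base_url}, f, indent="")
```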
diff --git a/modeling/inference_models/parents/hf.py b/modeling/inference_models/hf.py
similarity index 94%
rename from modeling/inference_models/parents/hf.py
rename to modeling/inference_models/hf.py
index 70143b69..bb3f7fe4 100644
--- a/modeling/inference_models/parents/hf.py
+++ b/modeling/inference_models/hf.py
@@ -3,6 +3,7 @@ from typing import Optional
 from transformers import AutoConfig
 import warnings
 import utils
+import json
 import koboldai_settings
 from logger import logger
 from modeling.inference_model import InferenceModel
@@ -44,16 +45,15 @@ class HFInferenceModel(InferenceModel):
             self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
         layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
         if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
-            if os.path.exists("settings/{}.breakmodel".format(model_name.replace("/", "_"))):
-                with open("settings/{}.breakmodel".format(model_name.replace("/", "_")), "r") as file:
-                    data = [x for x in file.read().split("\n")[:2] if x != '']
-                    if len(data) < 2:
-                        data.append("0")
-                    break_values, disk_blocks = data
-                    break_values = break_values.split(",")
+            if os.path.exists("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
+                with open("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
+                    temp = json.load(f)
+                    break_values = temp['layers'] if 'layers' in temp else [layer_count]
+                    disk_blocks = temp['disk_layers'] if 'disk_layers' in temp else 0
             else:
                 break_values = [layer_count]
                 disk_blocks = 0
+            break_values = [int(x) for x in break_values if x != '' and x is not None]
             gpu_count = torch.cuda.device_count()
             break_values += [0] * (gpu_count - len(break_values))

@@ -132,8 +132,15 @@ class HFInferenceModel(InferenceModel):
         if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
             gpu_count = torch.cuda.device_count()
             layers = []
+            logger.info(parameters)
             for i in range(gpu_count):
-                layers.append(int(parameters["{}_Layers".format(i)]) if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric() else None)
+                logger.info(parameters["{}_Layers".format(i)])
+                if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric():
+                    layers.append(int(parameters["{}_Layers".format(i)]))
+                elif isinstance(parameters["{}_Layers".format(i)], str):
+                    layers.append(None)
+                else:
+                    layers.append(parameters["{}_Layers".format(i)])
             self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None
             if isinstance(self.cpu_layers, str):
                 self.cpu_layers = int(self.cpu_layers) if self.cpu_layers.isnumeric() else 0
diff --git a/modeling/inference_models/hf_mtj.py b/modeling/inference_models/hf_mtj/class.py
similarity index 99%
rename from modeling/inference_models/hf_mtj.py
rename to modeling/inference_models/hf_mtj/class.py
index 6351eca2..4de3a1b2 100644
--- a/modeling/inference_models/hf_mtj.py
+++ b/modeling/inference_models/hf_mtj/class.py
@@ -16,7 +16,7 @@ from modeling.inference_model import (
     GenerationSettings,
     ModelCapabilities,
 )
-from modeling.inference_models.parents.hf import HFInferenceModel
+from modeling.inference_models.hf import HFInferenceModel
 from modeling.tokenizer import GenericTokenizer

 model_backend_name = "Huggingface MTJ"
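Illustration of the new breakmodel settings load in hf.py (the standalone function wrapper, filename, and numbers below are mine, not the patch's): the hunk above swaps the old two-line settings/<model>.breakmodel text file for the JSON file written by generic_hf_torch's _save_settings(), then coerces the loaded values to ints and pads them to the GPU count.

```python
import json
import os


def load_break_values(settings_path, layer_count, gpu_count):
    """Mirror the hf.py hunk: load a saved GPU/disk layer split, else default to all layers on one device."""
    if os.path.exists(settings_path):
        with open(settings_path, "r") as f:
            temp = json.load(f)
        break_values = temp['layers'] if 'layers' in temp else [layer_count]
        disk_blocks = temp['disk_layers'] if 'disk_layers' in temp else 0
    else:
        break_values = [layer_count]
        disk_blocks = 0
    # Drop empty/None entries and coerce to int, as the patch does.
    break_values = [int(x) for x in break_values if x != '' and x is not None]
    # Pad with zeros so every detected GPU gets an entry.
    break_values += [0] * (gpu_count - len(break_values))
    return break_values, disk_blocks


# Hypothetical usage: a 32-layer model on a machine with 2 GPUs.
# load_break_values("settings/some_model.generic_hf_torch.model_backend.settings", 32, 2)
```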
diff --git a/modeling/inference_models/parents/hf_torch.py b/modeling/inference_models/hf_torch.py
similarity index 85%
rename from modeling/inference_models/parents/hf_torch.py
rename to modeling/inference_models/hf_torch.py
index f0a4a66e..8fdb8c64 100644
--- a/modeling/inference_models/parents/hf_torch.py
+++ b/modeling/inference_models/hf_torch.py
@@ -31,7 +31,7 @@ from modeling import warpers
 from modeling.warpers import Warper
 from modeling.stoppers import Stoppers
 from modeling.post_token_hooks import PostTokenHooks
-from modeling.inference_models.parents.hf import HFInferenceModel
+from modeling.inference_models.hf import HFInferenceModel
 from modeling.inference_model import (
     GenerationResult,
     GenerationSettings,
@@ -823,135 +823,10 @@ class HFTorchInferenceModel(HFInferenceModel):
             breakmodel.gpu_blocks = [0] * n_layers
             return

-        elif (
-            utils.args.breakmodel_gpulayers is not None
-            or utils.args.breakmodel_disklayers is not None
-            or breakmodel.gpu_blocks != []
-        ):
-            try:
-                if breakmodel.gpu_blocks == []:
-                    if utils.args.breakmodel_gpulayers:
-                        breakmodel.gpu_blocks = list(
-                            map(int, utils.args.breakmodel_gpulayers.split(","))
-                        )
-                assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count()
-                s = n_layers
-                for i in range(len(breakmodel.gpu_blocks)):
-                    if breakmodel.gpu_blocks[i] <= -1:
-                        breakmodel.gpu_blocks[i] = s
-                        break
-                    else:
-                        s -= breakmodel.gpu_blocks[i]
-                assert sum(breakmodel.gpu_blocks) <= n_layers
-                n_layers -= sum(breakmodel.gpu_blocks)
-                n_layers -= breakmodel.disk_blocks
-            except:
-                logger.warning(
-                    "--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0."
-                )
-                breakmodel.gpu_blocks = [n_layers]
-                n_layers = 0
-        elif utils.args.breakmodel_layers is not None:
-            breakmodel.gpu_blocks = [
-                n_layers - max(0, min(n_layers, utils.args.breakmodel_layers))
-            ]
-            n_layers -= sum(breakmodel.gpu_blocks)
-        elif utils.args.model is not None:
+        elif breakmodel.gpu_blocks != []:
             logger.info("Breakmodel not specified, assuming GPU 0")
             breakmodel.gpu_blocks = [n_layers]
             n_layers = 0
-        else:
-            device_count = torch.cuda.device_count()
-            if device_count > 1:
-                print(
-                    Colors.CYAN
-                    + "\nPlease select one of your GPUs to be your primary GPU."
-                )
-                print(
-                    "VRAM usage in your primary GPU will be higher than for your other ones."
-                )
-                print("It is recommended you make your fastest GPU your primary GPU.")
-                self.breakmodel_device_list(n_layers)
-                while True:
-                    primaryselect = input("device ID> ")
-                    if (
-                        primaryselect.isnumeric()
-                        and 0 <= int(primaryselect) < device_count
-                    ):
-                        breakmodel.primary_device = int(primaryselect)
-                        break
-                    else:
-                        print(
-                            f"{Colors.RED}Please enter an integer between 0 and {device_count-1}.{Colors.END}"
-                        )
-            else:
-                breakmodel.primary_device = 0
-
-            print(
-                Colors.PURPLE
-                + "\nIf you don't have enough VRAM to run the model on a single GPU"
-            )
-            print(
-                "you can split the model between your CPU and your GPU(s), or between"
-            )
-            print("multiple GPUs if you have more than one.")
-            print("By putting more 'layers' on a GPU or CPU, more computations will be")
-            print(
-                "done on that device and more VRAM or RAM will be required on that device"
-            )
-            print("(roughly proportional to number of layers).")
-            print(
-                "It should be noted that GPUs are orders of magnitude faster than the CPU."
-            )
-            print(
-                f"This model has{Colors.YELLOW} {n_layers} {Colors.PURPLE}layers.{Colors.END}\n"
-            )
-
-            for i in range(device_count):
-                self.breakmodel_device_list(
-                    n_layers, primary=breakmodel.primary_device, selected=i
-                )
-                print(
-                    f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
-                )
-                while True:
-                    layerselect = input("# of layers> ")
-                    if (
-                        layerselect.isnumeric() or layerselect.strip() == "-1"
-                    ) and -1 <= int(layerselect) <= n_layers:
-                        layerselect = int(layerselect)
-                        layerselect = n_layers if layerselect == -1 else layerselect
-                        breakmodel.gpu_blocks.append(layerselect)
-                        n_layers -= layerselect
-                        break
-                    else:
-                        print(
-                            f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
-                        )
-                if n_layers == 0:
-                    break
-
-            if n_layers > 0:
-                self.breakmodel_device_list(
-                    n_layers, primary=breakmodel.primary_device, selected=-1
-                )
-                print(
-                    f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
-                )
-                while True:
-                    layerselect = input("# of layers> ")
-                    if (
-                        layerselect.isnumeric() or layerselect.strip() == "-1"
-                    ) and -1 <= int(layerselect) <= n_layers:
-                        layerselect = int(layerselect)
-                        layerselect = n_layers if layerselect == -1 else layerselect
-                        breakmodel.disk_blocks = layerselect
-                        n_layers -= layerselect
-                        break
-                    else:
-                        print(
-                            f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
-                        )

         logger.init_ok("Final device configuration:", status="Info")
         self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)
diff --git a/modeling/inference_models/horde.py b/modeling/inference_models/horde/class.py
similarity index 95%
rename from modeling/inference_models/horde.py
rename to modeling/inference_models/horde/class.py
index 8e05fbbd..387c5833 100644
--- a/modeling/inference_models/horde.py
+++ b/modeling/inference_models/horde/class.py
@@ -38,6 +38,11 @@ class model_backend(InferenceModel):
         return model_name == "CLUSTER" or model_name in [x['value'] for x in self.models]

     def get_requested_parameters(self, model_name, model_path, menu_path):
+        if os.path.exists("settings/api.model_backend.settings") and 'base_url' not in vars(self):
+            with open("settings/horde.model_backend.settings", "r") as f:
+                temp = json.load(f)
+                self.base_url = temp['url']
+                self.key = temp['key']
         requested_parameters = []
         requested_parameters.extend([{
             "uitype": "text",
@@ -122,6 +127,10 @@ class model_backend(InferenceModel):
             #else "gpt2",
         )

+    def _save_settings(self):
+        with open("settings/horde.model_backend.settings", "w") as f:
+            json.dump({"key": self.key, "url": self.url}, f, indent="")
+
     def _raw_generate(
         self,
         prompt_tokens: Union[List[int], torch.Tensor],
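The horde backend above uses the same persistence pattern as the API backends: _save_settings() serializes the connection details to settings/horde.model_backend.settings and get_requested_parameters() reads them back before building the UI fields. A round-trip sketch of that file follows; the key and URL values are placeholders, and the JSON field names follow what _save_settings() writes.

```python
import json

# Roughly what horde's _save_settings() writes (placeholder values):
with open("settings/horde.model_backend.settings", "w") as f:
    json.dump({"key": "0000000000", "url": "https://example-horde-host"}, f, indent="")

# Roughly what get_requested_parameters() reads back:
with open("settings/horde.model_backend.settings", "r") as f:
    temp = json.load(f)
key, url = temp['key'], temp['url']
```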
diff --git a/modeling/inference_models/openai.py b/modeling/inference_models/openai/class.py
similarity index 81%
rename from modeling/inference_models/openai.py
rename to modeling/inference_models/openai/class.py
index 19a7d1e6..84fe6df9 100644
--- a/modeling/inference_models/openai.py
+++ b/modeling/inference_models/openai/class.py
@@ -11,13 +11,14 @@ from modeling.inference_model import (
     InferenceModel,
 )

-from modeling.inference_models.parents.openai_gooseai import model_backend as openai_gooseai_model_backend
+from modeling.inference_models.openai_gooseai import model_backend as openai_gooseai_model_backend

 model_backend_name = "OpenAI"

 class OpenAIAPIError(Exception):
     def __init__(self, error_type: str, error_message) -> None:
         super().__init__(f"{error_type}: {error_message}")
+        self.source = "OpenAI"


 class model_backend(openai_gooseai_model_backend):
diff --git a/modeling/inference_models/parents/openai_gooseai.py b/modeling/inference_models/openai_gooseai.py
similarity index 93%
rename from modeling/inference_models/parents/openai_gooseai.py
rename to modeling/inference_models/openai_gooseai.py
index 871ea5ce..4d885074 100644
--- a/modeling/inference_models/parents/openai_gooseai.py
+++ b/modeling/inference_models/openai_gooseai.py
@@ -25,15 +25,14 @@ class model_backend(InferenceModel):
         super().__init__()
         self.key = ""
         self.url = "https://api.goose.ai/v1/engines"
-        #if self.source == 'OAI':
-        #    url = "https://api.openai.com/v1/engines"
-        #elif self.source == 'GooseAI':
-        #    url = "https://api.goose.ai/v1/engines"

     def is_valid(self, model_name, model_path, menu_path):
         return model_name == "OAI" or model_name == "GooseAI"

     def get_requested_parameters(self, model_name, model_path, menu_path):
+        if os.path.exists("settings/{}.model_backend.settings".format(self.source)) and 'colaburl' not in vars(self):
+            with open("settings/{}.model_backend.settings".format(self.source), "r") as f:
+                self.key = json.load(f)['key']
         self.source = model_name
         requested_parameters = []
         requested_parameters.extend([{
@@ -41,7 +40,7 @@ class model_backend(InferenceModel):
             "unit": "text",
             "label": "Key",
             "id": "key",
-            "default": "",
+            "default": self.key,
             "check": {"value": "", 'check': "!="},
             "tooltip": "User Key to use when connecting to OpenAI/GooseAI.",
             "menu_path": "",
@@ -106,6 +105,10 @@ class model_backend(InferenceModel):
     def _load(self, save_model: bool, initial_load: bool) -> None:
         self.tokenizer = self._get_tokenizer("gpt2")

+    def _save_settings(self):
+        with open("settings/{}.model_backend.settings".format(self.source), "w") as f:
+            json.dump({"key": self.key}, f, indent="")
+
     def _raw_generate(
         self,
         prompt_tokens: Union[List[int], torch.Tensor],
diff --git a/modeling/inference_models/readonly.py b/modeling/inference_models/readonly/class.py
similarity index 100%
rename from modeling/inference_models/readonly.py
rename to modeling/inference_models/readonly/class.py
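In the shared openai_gooseai.py backend above, the settings path is keyed on self.source, so the OpenAI and GooseAI subclasses persist their API keys to separate files. A small sketch of that key round-trip; the key value is a placeholder, and how self.source is first initialised in the subclasses is not shown in this diff.

```python
import json
import os

source = "GooseAI"  # is_valid() accepts "OAI" or "GooseAI"; get_requested_parameters() later sets self.source = model_name
path = "settings/{}.model_backend.settings".format(source)

# What _save_settings() writes (placeholder key):
with open(path, "w") as f:
    json.dump({"key": "placeholder-api-key"}, f, indent="")

# What get_requested_parameters() reads back to pre-fill the "Key" field:
if os.path.exists(path):
    with open(path, "r") as f:
        saved_key = json.load(f)['key']
```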
diff --git a/modeling/inference_models/rwkv.py b/modeling/inference_models/rwkv.py
deleted file mode 100644
index fa6497b7..00000000
--- a/modeling/inference_models/rwkv.py
+++ /dev/null
@@ -1,258 +0,0 @@
-from __future__ import annotations
-import os
-
-
-import time
-from typing import Dict, List, Optional, Union
-import numpy as np
-import requests
-from tokenizers import Tokenizer
-from tqdm import tqdm
-from huggingface_hub import hf_hub_url
-
-import torch
-from torch.nn import functional as F
-
-# Must be defined before import
-os.environ["RWKV_JIT_ON"] = "1"
-# TODO: Include compiled kernel
-os.environ["RWKV_CUDA_ON"] = "1"
-
-
-import utils
-from logger import logger
-
-from modeling import warpers
-from modeling.warpers import Warper
-from modeling.stoppers import Stoppers
-from modeling.post_token_hooks import PostTokenHooks
-from modeling.tokenizer import GenericTokenizer
-from modeling.inference_model import (
-    GenerationResult,
-    GenerationSettings,
-    InferenceModel,
-    ModelCapabilities,
-)
-
-TOKENIZER_URL = (
-    "https://raw.githubusercontent.com/BlinkDL/ChatRWKV/main/20B_tokenizer.json"
-)
-TOKENIZER_PATH = "models/rwkv/20b_tokenizer.json"
-
-REPO_OWNER = "BlinkDL"
-MODEL_FILES = {
-    "rwkv-4-pile-14b": "RWKV-4-Pile-14B-20230213-8019.pth",
-    # NOTE: Still in progress(?)
-    "rwkv-4-pile-14b:ctx4096": "RWKV-4-Pile-14B-20230228-ctx4096-test663.pth",
-    "rwkv-4-pile-7b": "RWKV-4-Pile-7B-20221115-8047.pth",
-    "rwkv-4-pile-7b:ctx4096": "RWKV-4-Pile-7B-20230109-ctx4096.pth",
-    "rwkv-4-pile-3b": "RWKV-4-Pile-3B-20221008-8023.pth",
-    "rwkv-4-pile-3b:ctx4096": "RWKV-4-Pile-3B-20221110-ctx4096.pth",
-    "rwkv-4-pile-1b5": "RWKV-4-Pile-1B5-20220903-8040.pth",
-    "rwkv-4-pile-1b5:ctx4096": "RWKV-4-Pile-1B5-20220929-ctx4096.pth",
-    "rwkv-4-pile-430m": "RWKV-4-Pile-430M-20220808-8066.pth",
-    "rwkv-4-pile-169m": "RWKV-4-Pile-169M-20220807-8023.pth",
-}
-
-
-model_backend_name = "RWKV"
-
-
-class model_backend(InferenceModel):
-    def __init__(
-        self,
-        #model_name: str,
-    ) -> None:
-        super().__init__()
-        #self.model_name = model_name
-
-        self.post_token_hooks = [
-            PostTokenHooks.stream_tokens,
-        ]
-
-        self.stopper_hooks = [
-            Stoppers.core_stopper,
-            Stoppers.dynamic_wi_scanner,
-            Stoppers.singleline_stopper,
-            Stoppers.chat_mode_stopper,
-            Stoppers.stop_sequence_stopper,
-        ]
-
-        self.capabilties = ModelCapabilities(
-            embedding_manipulation=False,
-            post_token_hooks=True,
-            stopper_hooks=True,
-            post_token_probs=True,
-        )
-        self._old_stopping_criteria = None
-
-    def is_valid(self, model_name, model_path, menu_path):
-        try:
-            from rwkv.model import RWKV
-            valid = True
-        except:
-            valid = False
-        return valid and "rwkv" in model_name.lower()
-
-    def get_requested_parameters(self, model_name, model_path, menu_path):
-        self.source = model_name
-        requested_parameters = []
-        return requested_parameters
-
-    def set_input_parameters(self):
-        return
-
-
-    def _ensure_directory_structure(self) -> None:
-        for path in ["models/rwkv", "models/rwkv/models"]:
-            try:
-                os.mkdir(path)
-            except FileExistsError:
-                pass
-
-    def _get_tokenizer(self) -> GenericTokenizer:
-        if not os.path.exists(TOKENIZER_PATH):
-            logger.info("RWKV tokenizer not found, downloading...")
-
-            r = requests.get(TOKENIZER_URL)
-            with open(TOKENIZER_PATH, "wb") as file:
-                file.write(r.content)
-
-        return GenericTokenizer(Tokenizer.from_file(TOKENIZER_PATH))
-
-    def _download_model(self, model_path: str, model_class: str) -> None:
-        logger.info(f"{self.model_name} not found, downloading...")
-
-        url = hf_hub_url(
-            repo_id=f"{REPO_OWNER}/{model_class}",
-            filename=MODEL_FILES[self.model_name],
-        )
-
-        # TODO: Use aria2
-        # https://stackoverflow.com/a/57030446
-        with requests.get(url, stream=True) as r:
-            r.raise_for_status()
-            bar = tqdm(
-                desc="Downloading RWKV Model",
-                unit="B",
-                unit_scale=True,
-                total=int(r.headers["Content-Length"]),
-            )
-            with open(model_path, "wb") as file:
-                for chunk in r.iter_content(chunk_size=8192):
-                    if not chunk:
-                        continue
-                    file.write(chunk)
-                    bar.update(len(chunk))
-
-    def _load(self, save_model: bool, initial_load: bool) -> None:
-        self._ensure_directory_structure()
-        self.tokenizer = self._get_tokenizer()
-
-        # Parse model name
-        model_class, _, special = self.model_name.partition(":")
-        special = special or None
-
-        model_dir = os.path.join("models", "rwkv", "models", model_class)
-        if not os.path.exists(model_dir):
-            os.mkdir(model_dir)
-
-        # Download model if we need to
-        model_path = os.path.join(model_dir, MODEL_FILES[self.model_name])
-        if not os.path.exists(model_path):
-            self._download_model(model_path, model_class)
-
-        # Now we load!
-
-        # TODO: Breakmodel to strat
-        from rwkv.model import RWKV
-        self.model = RWKV(model=model_path, strategy="cuda:0 fp16")
-
-    def _apply_warpers(
-        self, scores: torch.Tensor, input_ids: torch.Tensor
-    ) -> torch.Tensor:
-        warpers.update_settings()
-        for sid in utils.koboldai_vars.sampler_order:
-            warper = Warper.from_id(sid)
-
-            if not warper.value_is_valid():
-                continue
-
-            if warper == warpers.RepetitionPenalty:
-                # Rep pen needs more data than other samplers
-                scores = warper.torch(scores, input_ids=input_ids)
-            else:
-                scores = warper.torch(scores)
-        return scores
-
-    def _sample_token(self, logits: torch.Tensor, input_ids: torch.Tensor) -> int:
-        probs = F.softmax(logits.float(), dim=-1)
-
-        if probs.device == torch.device("cpu"):
-            probs = probs.numpy()
-            sorted_ids = np.argsort(probs)
-            sorted_probs = probs[sorted_ids][::-1]
-
-            probs = self._apply_warpers(probs[None, :], input_ids)
-
-            # TODO: is this right?
-            probs[probs == -torch.inf] = 0.0
-
-            probs = probs / np.sum(probs)
-            out = np.random.choice(a=len(probs), p=probs)
-            return int(out)
-        else:
-            sorted_ids = torch.argsort(probs)
-            sorted_probs = probs[sorted_ids]
-            sorted_probs = torch.flip(sorted_probs, dims=(0,))
-
-            probs = self._apply_warpers(probs[None, :], input_ids)
-
-            # TODO: is this right?
-            probs[probs == -torch.inf] = 0.0
-
-            out = torch.multinomial(probs, num_samples=1)[0]
-            return int(out)
-
-    def _raw_generate(
-        self,
-        prompt_tokens: Union[List[int], torch.Tensor],
-        max_new: int,
-        gen_settings: GenerationSettings,
-        single_line: bool = False,
-        batch_count: int = 1,
-        seed: Optional[int] = None,
-        **kwargs,
-    ) -> GenerationResult:
-        if seed is not None:
-            torch.manual_seed(seed)
-
-        aux_device = utils.get_auxilary_device()
-        context = torch.tensor(prompt_tokens)[None, :].to(aux_device)
-        out = []
-
-        start_time = time.time()
-        with torch.no_grad():
-            logits, state = self.model.forward(prompt_tokens, None)
-            last_token = prompt_tokens[-1]
-
-            for _ in range(max_new):
-
-                logits, state = self.model.forward([last_token], state)
-                last_token = self._sample_token(logits, context)
-                out.append(last_token)
-                add = torch.tensor([[last_token]]).to(aux_device)
-                context = torch.cat((context, add), dim=-1)
-                self._post_token_gen(context)
-
-        logger.debug(
-            "torch_raw_generate: run generator {}s".format(time.time() - start_time)
-        )
-
-        return GenerationResult(
-            self,
-            out_batches=torch.tensor([out]),
-            prompt=prompt_tokens,
-            is_whole_generation=False,
-            output_includes_prompt=True,
-        )