Moved model backends to separate folders

added some model backend settings save/load
ebolam
2023-05-18 20:14:33 -04:00
parent 4040538d34
commit 06f59a7b7b
14 changed files with 69 additions and 409 deletions
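In short, each inference backend now lives in its own folder under modeling/inference_models/ with a class.py module that exposes model_backend_name and a model_backend class, and a backend can persist its UI parameters as a small JSON file under settings/. A minimal sketch of that save/load pattern, with an illustrative ExampleBackend class and settings filename that are not part of the commit:

import json
import os


class ExampleBackend:
    # Illustrative stand-in for a model_backend class that would live in
    # modeling/inference_models/<folder>/class.py.
    settings_path = "settings/example.model_backend.settings"

    def __init__(self) -> None:
        self.base_url = ""

    def load_settings(self) -> None:
        # Re-read previously saved parameters so the UI can show them as defaults.
        if os.path.exists(self.settings_path):
            with open(self.settings_path, "r") as f:
                self.base_url = json.load(f).get("base_url", "")

    def save_settings(self) -> None:
        # Same idea as the _save_settings() hook this commit adds to the backends.
        with open(self.settings_path, "w") as f:
            json.dump({"base_url": self.base_url}, f, indent="")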

View File

@@ -626,9 +626,9 @@ import importlib
 model_backend_code = {}
 model_backends = {}
 for module in os.listdir("./modeling/inference_models"):
-    if os.path.isfile(os.path.join("./modeling/inference_models",module)) and module[-3:] == '.py':
-        model_backend_code[module[:-3]] = importlib.import_module('modeling.inference_models.{}'.format(module[:-3]))
-        model_backends[model_backend_code[module[:-3]].model_backend_name] = model_backend_code[module[:-3]].model_backend()
+    if not os.path.isfile(os.path.join("./modeling/inference_models",module)) and module != '__pycache__':
+        model_backend_code[module] = importlib.import_module('modeling.inference_models.{}.class'.format(module))
+        model_backends[model_backend_code[module].model_backend_name] = model_backend_code[module].model_backend()
 old_socketio_on = socketio.on
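For context, the new loop treats every subfolder of modeling/inference_models/ (except __pycache__) as a backend and imports its class module, instead of importing loose .py files. A self-contained sketch of the same pattern, where the function name and the default root path are just for illustration:

import importlib
import os


def discover_backends(root: str = "./modeling/inference_models") -> dict:
    # Import modeling.inference_models.<folder>.class for every backend folder
    # and map each backend's display name to an instance of its model_backend class.
    backends = {}
    for module in os.listdir(root):
        if not os.path.isfile(os.path.join(root, module)) and module != "__pycache__":
            code = importlib.import_module(
                "modeling.inference_models.{}.class".format(module)
            )
            backends[code.model_backend_name] = code.model_backend()
    return backends

Backends are now keyed by folder name rather than by a filename with its .py extension stripped.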

View File

@@ -188,6 +188,7 @@ class InferenceModel:
         self._pre_load()
         self._load(save_model=save_model, initial_load=initial_load)
         self._post_load()
+        self._save_settings()
 
     def unload(self):
         return
@@ -198,6 +199,9 @@ class InferenceModel:
     def _post_load(self) -> None:
         """Post load hook. Called after `_load()`."""
 
+    def _save_settings(self) -> None:
+        """Save settings hook. Called after `_post_load()`."""
+
     def _load(self, save_model: bool, initial_load: bool) -> None:
         """Main load method. All logic related to loading the model onto the
         selected device(s) and preparing it for inference should be implemented here."""
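A condensed sketch of the loading sequence implied by the hunk above; the enclosing method name (load) and its default arguments are assumptions here, since the diff only shows the hook calls:

class InferenceModel:
    def load(self, save_model: bool = False, initial_load: bool = False) -> None:
        # Hooks run in this order, ending with the new settings-save step.
        self._pre_load()
        self._load(save_model=save_model, initial_load=initial_load)
        self._post_load()
        self._save_settings()

    def _pre_load(self) -> None:
        """Pre load hook. Called before `_load()`."""

    def _post_load(self) -> None:
        """Post load hook. Called after `_load()`."""

    def _save_settings(self) -> None:
        """Save settings hook. Called after `_post_load()`."""

    def _load(self, save_model: bool, initial_load: bool) -> None:
        """Main load method, implemented by each backend."""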

View File

@@ -26,19 +26,22 @@ class APIException(Exception):
 class model_backend(InferenceModel):
     def __init__(self) -> None:
         super().__init__()
-        #self.base_url = ""
+        self.base_url = ""
 
     def is_valid(self, model_name, model_path, menu_path):
         return model_name == "API"
 
     def get_requested_parameters(self, model_name, model_path, menu_path):
+        if os.path.exists("settings/api.model_backend.settings") and 'base_url' not in vars(self):
+            with open("settings/api.model_backend.settings", "r") as f:
+                self.base_url = json.load(f)['base_url']
         requested_parameters = []
         requested_parameters.append({
             "uitype": "text",
             "unit": "text",
             "label": "URL",
             "id": "base_url",
-            "default": False,
+            "default": self.base_url,
             "check": {"value": "", 'check': "!="},
             "tooltip": "The URL of the KoboldAI API to connect to.",
             "menu_path": "",
@@ -58,6 +61,10 @@ class model_backend(InferenceModel):
         # Do not allow API to be served over the API
         self.capabilties = ModelCapabilities(api_host=False)
 
+    def _save_settings(self):
+        with open("settings/api.model_backend.settings", "w") as f:
+            json.dump({"base_url": self.base_url}, f, indent="")
+
     def _raw_generate(
         self,
         prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -24,6 +24,7 @@ class BasicAPIException(Exception):
 class model_backend(InferenceModel):
     def __init__(self) -> None:
         super().__init__()
+        self.colaburl = ""
 
         # Do not allow API to be served over the API
         self.capabilties = ModelCapabilities(api_host=False)
@@ -32,13 +33,16 @@ class model_backend(InferenceModel):
         return model_name == "Colab"
 
     def get_requested_parameters(self, model_name, model_path, menu_path):
+        if os.path.exists("settings/api.model_backend.settings") and 'colaburl' not in vars(self):
+            with open("settings/api.model_backend.settings", "r") as f:
+                self.colaburl = json.load(f)['base_url']
         requested_parameters = []
         requested_parameters.append({
             "uitype": "text",
             "unit": "text",
             "label": "URL",
             "id": "colaburl",
-            "default": False,
+            "default": self.colaburl,
             "check": {"value": "", 'check': "!="},
             "tooltip": "The URL of the Colab KoboldAI API to connect to.",
             "menu_path": "",
@@ -56,6 +60,10 @@ class model_backend(InferenceModel):
     def _load(self, save_model: bool, initial_load: bool) -> None:
         self.tokenizer = self._get_tokenizer("EleutherAI/gpt-neo-2.7B")
 
+    def _save_settings(self):
+        with open("settings/basic_api.model_backend.settings", "w") as f:
+            json.dump({"colaburl": self.colaburl}, f, indent="")
+
     def _raw_generate(
         self,
         prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -20,7 +20,7 @@ except ModuleNotFoundError as e:
     if not utils.koboldai_vars.use_colab_tpu:
         raise e
 
-from modeling.inference_models.parents.hf_torch import HFTorchInferenceModel
+from modeling.inference_models.hf_torch import HFTorchInferenceModel
 
 model_backend_name = "Huggingface"
@@ -270,3 +270,7 @@ class model_backend(HFTorchInferenceModel):
         self.model.kai_model = self
         utils.koboldai_vars.modeldim = self.get_hidden_size()
 
+    def _save_settings(self):
+        with open("settings/{}.generic_hf_torch.model_backend.settings".format(self.model_name.replace("/", "_")), "w") as f:
+            json.dump({"layers": self.layers if 'layers' in vars(self) else [], "disk_layers": self.disk_layers if 'disk_layers' in vars(self) else 0}, f, indent="")
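For a sense of what this writes, a hypothetical example of the JSON produced by _save_settings() for a model split across two GPUs with no disk cache; the file name and the values are invented for illustration:

# e.g. settings/EleutherAI_gpt-neo-2.7B.generic_hf_torch.model_backend.settings
example_settings = {
    "layers": [16, 16],  # breakmodel layer counts for GPU 0 and GPU 1
    "disk_layers": 0,    # layers kept in the disk cache
}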

View File

@@ -11,14 +11,14 @@ from modeling.inference_model import (
     InferenceModel,
 )
-from modeling.inference_models.parents.openai_gooseai import model_backend as openai_gooseai_model_backend
+from modeling.inference_models.openai_gooseai import model_backend as openai_gooseai_model_backend
 
 model_backend_name = "GooseAI"
 
 class OpenAIAPIError(Exception):
     def __init__(self, error_type: str, error_message) -> None:
         super().__init__(f"{error_type}: {error_message}")
+        self.source = "GooseAI"
 
 class model_backend(openai_gooseai_model_backend):

View File

@@ -3,6 +3,7 @@ from typing import Optional
 from transformers import AutoConfig
 import warnings
 import utils
+import json
 import koboldai_settings
 from logger import logger
 from modeling.inference_model import InferenceModel
@@ -44,16 +45,15 @@ class HFInferenceModel(InferenceModel):
         self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
         layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
         if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
-            if os.path.exists("settings/{}.breakmodel".format(model_name.replace("/", "_"))):
-                with open("settings/{}.breakmodel".format(model_name.replace("/", "_")), "r") as file:
-                    data = [x for x in file.read().split("\n")[:2] if x != '']
-                    if len(data) < 2:
-                        data.append("0")
-                    break_values, disk_blocks = data
-                    break_values = break_values.split(",")
+            if os.path.exists("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
+                with open("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
+                    temp = json.load(f)
+                    break_values = temp['layers'] if 'layers' in temp else [layer_count]
+                    disk_blocks = temp['disk_layers'] if 'disk_layers' in temp else 0
             else:
                 break_values = [layer_count]
                 disk_blocks = 0
             break_values = [int(x) for x in break_values if x != '' and x is not None]
             gpu_count = torch.cuda.device_count()
             break_values += [0] * (gpu_count - len(break_values))
@@ -132,8 +132,15 @@ class HFInferenceModel(InferenceModel):
         if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
             gpu_count = torch.cuda.device_count()
             layers = []
+            logger.info(parameters)
             for i in range(gpu_count):
-                layers.append(int(parameters["{}_Layers".format(i)]) if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric() else None)
+                logger.info(parameters["{}_Layers".format(i)])
+                if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric():
+                    layers.append(int(parameters["{}_Layers".format(i)]))
+                elif isinstance(parameters["{}_Layers".format(i)], str):
+                    layers.append(None)
+                else:
+                    layers.append(parameters["{}_Layers".format(i)])
             self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None
             if isinstance(self.cpu_layers, str):
                 self.cpu_layers = int(self.cpu_layers) if self.cpu_layers.isnumeric() else 0
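Because these saved values feed the breakmodel layer split, a small illustrative helper (not part of the codebase) shows how the JSON settings map onto a per-GPU layer list:

import json


def load_break_values(path: str, layer_count: int, gpu_count: int):
    # Read saved per-GPU layer counts and the disk-cache layer count,
    # fall back to putting every layer on one device, then pad the list
    # so there is one entry per visible GPU.
    try:
        with open(path, "r") as f:
            temp = json.load(f)
        break_values = temp["layers"] if "layers" in temp else [layer_count]
        disk_blocks = temp["disk_layers"] if "disk_layers" in temp else 0
    except FileNotFoundError:
        break_values, disk_blocks = [layer_count], 0

    break_values = [int(x) for x in break_values if x != "" and x is not None]
    break_values += [0] * (gpu_count - len(break_values))
    return break_values, disk_blocks

With no settings file present, load_break_values("missing.json", 32, 2) returns ([32, 0], 0), i.e. every layer on the first GPU and nothing in the disk cache.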

View File

@@ -16,7 +16,7 @@ from modeling.inference_model import (
     GenerationSettings,
     ModelCapabilities,
 )
-from modeling.inference_models.parents.hf import HFInferenceModel
+from modeling.inference_models.hf import HFInferenceModel
 from modeling.tokenizer import GenericTokenizer
 
 model_backend_name = "Huggingface MTJ"

View File

@@ -31,7 +31,7 @@ from modeling import warpers
 from modeling.warpers import Warper
 from modeling.stoppers import Stoppers
 from modeling.post_token_hooks import PostTokenHooks
-from modeling.inference_models.parents.hf import HFInferenceModel
+from modeling.inference_models.hf import HFInferenceModel
 from modeling.inference_model import (
     GenerationResult,
     GenerationSettings,
@@ -823,135 +823,10 @@ class HFTorchInferenceModel(HFInferenceModel):
             breakmodel.gpu_blocks = [0] * n_layers
             return
-        elif (
-            utils.args.breakmodel_gpulayers is not None
-            or utils.args.breakmodel_disklayers is not None
-            or breakmodel.gpu_blocks != []
-        ):
-            try:
-                if breakmodel.gpu_blocks == []:
-                    if utils.args.breakmodel_gpulayers:
-                        breakmodel.gpu_blocks = list(
-                            map(int, utils.args.breakmodel_gpulayers.split(","))
-                        )
-                assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count()
-                s = n_layers
-                for i in range(len(breakmodel.gpu_blocks)):
-                    if breakmodel.gpu_blocks[i] <= -1:
-                        breakmodel.gpu_blocks[i] = s
-                        break
-                    else:
-                        s -= breakmodel.gpu_blocks[i]
-                assert sum(breakmodel.gpu_blocks) <= n_layers
-                n_layers -= sum(breakmodel.gpu_blocks)
-                n_layers -= breakmodel.disk_blocks
-            except:
-                logger.warning(
-                    "--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0."
-                )
-                breakmodel.gpu_blocks = [n_layers]
-                n_layers = 0
-        elif utils.args.breakmodel_layers is not None:
-            breakmodel.gpu_blocks = [
-                n_layers - max(0, min(n_layers, utils.args.breakmodel_layers))
-            ]
-            n_layers -= sum(breakmodel.gpu_blocks)
-        elif utils.args.model is not None:
+        elif breakmodel.gpu_blocks != []:
             logger.info("Breakmodel not specified, assuming GPU 0")
             breakmodel.gpu_blocks = [n_layers]
             n_layers = 0
-        else:
-            device_count = torch.cuda.device_count()
-            if device_count > 1:
-                print(
-                    Colors.CYAN
-                    + "\nPlease select one of your GPUs to be your primary GPU."
-                )
-                print(
-                    "VRAM usage in your primary GPU will be higher than for your other ones."
-                )
-                print("It is recommended you make your fastest GPU your primary GPU.")
-                self.breakmodel_device_list(n_layers)
-                while True:
-                    primaryselect = input("device ID> ")
-                    if (
-                        primaryselect.isnumeric()
-                        and 0 <= int(primaryselect) < device_count
-                    ):
-                        breakmodel.primary_device = int(primaryselect)
-                        break
-                    else:
-                        print(
-                            f"{Colors.RED}Please enter an integer between 0 and {device_count-1}.{Colors.END}"
-                        )
-            else:
-                breakmodel.primary_device = 0
-            print(
-                Colors.PURPLE
-                + "\nIf you don't have enough VRAM to run the model on a single GPU"
-            )
-            print(
-                "you can split the model between your CPU and your GPU(s), or between"
-            )
-            print("multiple GPUs if you have more than one.")
-            print("By putting more 'layers' on a GPU or CPU, more computations will be")
-            print(
-                "done on that device and more VRAM or RAM will be required on that device"
-            )
-            print("(roughly proportional to number of layers).")
-            print(
-                "It should be noted that GPUs are orders of magnitude faster than the CPU."
-            )
-            print(
-                f"This model has{Colors.YELLOW} {n_layers} {Colors.PURPLE}layers.{Colors.END}\n"
-            )
-            for i in range(device_count):
-                self.breakmodel_device_list(
-                    n_layers, primary=breakmodel.primary_device, selected=i
-                )
-                print(
-                    f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
-                )
-                while True:
-                    layerselect = input("# of layers> ")
-                    if (
-                        layerselect.isnumeric() or layerselect.strip() == "-1"
-                    ) and -1 <= int(layerselect) <= n_layers:
-                        layerselect = int(layerselect)
-                        layerselect = n_layers if layerselect == -1 else layerselect
-                        breakmodel.gpu_blocks.append(layerselect)
-                        n_layers -= layerselect
-                        break
-                    else:
-                        print(
-                            f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
-                        )
-                if n_layers == 0:
-                    break
-            if n_layers > 0:
-                self.breakmodel_device_list(
-                    n_layers, primary=breakmodel.primary_device, selected=-1
-                )
-                print(
-                    f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
-                )
-                while True:
-                    layerselect = input("# of layers> ")
-                    if (
-                        layerselect.isnumeric() or layerselect.strip() == "-1"
-                    ) and -1 <= int(layerselect) <= n_layers:
-                        layerselect = int(layerselect)
-                        layerselect = n_layers if layerselect == -1 else layerselect
-                        breakmodel.disk_blocks = layerselect
-                        n_layers -= layerselect
-                        break
-                    else:
-                        print(
-                            f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
-                        )
         logger.init_ok("Final device configuration:", status="Info")
         self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)

View File

@@ -38,6 +38,11 @@ class model_backend(InferenceModel):
         return model_name == "CLUSTER" or model_name in [x['value'] for x in self.models]
 
     def get_requested_parameters(self, model_name, model_path, menu_path):
+        if os.path.exists("settings/api.model_backend.settings") and 'base_url' not in vars(self):
+            with open("settings/horde.model_backend.settings", "r") as f:
+                temp = json.load(f)
+                self.base_url = temp['url']
+                self.key = temp['key']
         requested_parameters = []
         requested_parameters.extend([{
             "uitype": "text",
@@ -122,6 +127,10 @@ class model_backend(InferenceModel):
             #else "gpt2",
         )
 
+    def _save_settings(self):
+        with open("settings/horde.model_backend.settings", "w") as f:
+            json.dump({"key": self.key, "url": self.url}, f, indent="")
+
     def _raw_generate(
         self,
         prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -11,13 +11,14 @@ from modeling.inference_model import (
     InferenceModel,
 )
-from modeling.inference_models.parents.openai_gooseai import model_backend as openai_gooseai_model_backend
+from modeling.inference_models.openai_gooseai import model_backend as openai_gooseai_model_backend
 
 model_backend_name = "OpenAI"
 
 class OpenAIAPIError(Exception):
     def __init__(self, error_type: str, error_message) -> None:
         super().__init__(f"{error_type}: {error_message}")
+        self.source = "OpenAI"
 
 class model_backend(openai_gooseai_model_backend):

View File

@@ -25,15 +25,14 @@ class model_backend(InferenceModel):
         super().__init__()
         self.key = ""
         self.url = "https://api.goose.ai/v1/engines"
-        #if self.source == 'OAI':
-        #    url = "https://api.openai.com/v1/engines"
-        #elif self.source == 'GooseAI':
-        #    url = "https://api.goose.ai/v1/engines"
 
     def is_valid(self, model_name, model_path, menu_path):
         return model_name == "OAI" or model_name == "GooseAI"
 
     def get_requested_parameters(self, model_name, model_path, menu_path):
+        if os.path.exists("settings/{}.model_backend.settings".format(self.source)) and 'colaburl' not in vars(self):
+            with open("settings/{}.model_backend.settings".format(self.source), "r") as f:
+                self.key = json.load(f)['key']
         self.source = model_name
         requested_parameters = []
         requested_parameters.extend([{
@@ -41,7 +40,7 @@ class model_backend(InferenceModel):
"unit": "text", "unit": "text",
"label": "Key", "label": "Key",
"id": "key", "id": "key",
"default": "", "default": self.key,
"check": {"value": "", 'check': "!="}, "check": {"value": "", 'check': "!="},
"tooltip": "User Key to use when connecting to OpenAI/GooseAI.", "tooltip": "User Key to use when connecting to OpenAI/GooseAI.",
"menu_path": "", "menu_path": "",
@@ -106,6 +105,10 @@ class model_backend(InferenceModel):
     def _load(self, save_model: bool, initial_load: bool) -> None:
         self.tokenizer = self._get_tokenizer("gpt2")
 
+    def _save_settings(self):
+        with open("settings/{}.model_backend.settings".format(self.source), "w") as f:
+            json.dump({"key": self.key}, f, indent="")
+
     def _raw_generate(
         self,
         prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -1,258 +0,0 @@
-from __future__ import annotations
-
-import os
-import time
-from typing import Dict, List, Optional, Union
-
-import numpy as np
-import requests
-from tokenizers import Tokenizer
-from tqdm import tqdm
-from huggingface_hub import hf_hub_url
-
-import torch
-from torch.nn import functional as F
-
-# Must be defined before import
-os.environ["RWKV_JIT_ON"] = "1"
-# TODO: Include compiled kernel
-os.environ["RWKV_CUDA_ON"] = "1"
-
-import utils
-from logger import logger
-
-from modeling import warpers
-from modeling.warpers import Warper
-from modeling.stoppers import Stoppers
-from modeling.post_token_hooks import PostTokenHooks
-from modeling.tokenizer import GenericTokenizer
-from modeling.inference_model import (
-    GenerationResult,
-    GenerationSettings,
-    InferenceModel,
-    ModelCapabilities,
-)
-
-TOKENIZER_URL = (
-    "https://raw.githubusercontent.com/BlinkDL/ChatRWKV/main/20B_tokenizer.json"
-)
-TOKENIZER_PATH = "models/rwkv/20b_tokenizer.json"
-REPO_OWNER = "BlinkDL"
-MODEL_FILES = {
-    "rwkv-4-pile-14b": "RWKV-4-Pile-14B-20230213-8019.pth",
-    # NOTE: Still in progress(?)
-    "rwkv-4-pile-14b:ctx4096": "RWKV-4-Pile-14B-20230228-ctx4096-test663.pth",
-    "rwkv-4-pile-7b": "RWKV-4-Pile-7B-20221115-8047.pth",
-    "rwkv-4-pile-7b:ctx4096": "RWKV-4-Pile-7B-20230109-ctx4096.pth",
-    "rwkv-4-pile-3b": "RWKV-4-Pile-3B-20221008-8023.pth",
-    "rwkv-4-pile-3b:ctx4096": "RWKV-4-Pile-3B-20221110-ctx4096.pth",
-    "rwkv-4-pile-1b5": "RWKV-4-Pile-1B5-20220903-8040.pth",
-    "rwkv-4-pile-1b5:ctx4096": "RWKV-4-Pile-1B5-20220929-ctx4096.pth",
-    "rwkv-4-pile-430m": "RWKV-4-Pile-430M-20220808-8066.pth",
-    "rwkv-4-pile-169m": "RWKV-4-Pile-169M-20220807-8023.pth",
-}
-
-model_backend_name = "RWKV"
-
-
-class model_backend(InferenceModel):
-    def __init__(
-        self,
-        #model_name: str,
-    ) -> None:
-        super().__init__()
-        #self.model_name = model_name
-
-        self.post_token_hooks = [
-            PostTokenHooks.stream_tokens,
-        ]
-
-        self.stopper_hooks = [
-            Stoppers.core_stopper,
-            Stoppers.dynamic_wi_scanner,
-            Stoppers.singleline_stopper,
-            Stoppers.chat_mode_stopper,
-            Stoppers.stop_sequence_stopper,
-        ]
-
-        self.capabilties = ModelCapabilities(
-            embedding_manipulation=False,
-            post_token_hooks=True,
-            stopper_hooks=True,
-            post_token_probs=True,
-        )
-        self._old_stopping_criteria = None
-
-    def is_valid(self, model_name, model_path, menu_path):
-        try:
-            from rwkv.model import RWKV
-            valid = True
-        except:
-            valid = False
-        return valid and "rwkv" in model_name.lower()
-
-    def get_requested_parameters(self, model_name, model_path, menu_path):
-        self.source = model_name
-        requested_parameters = []
-        return requested_parameters
-
-    def set_input_parameters(self):
-        return
-
-    def _ensure_directory_structure(self) -> None:
-        for path in ["models/rwkv", "models/rwkv/models"]:
-            try:
-                os.mkdir(path)
-            except FileExistsError:
-                pass
-
-    def _get_tokenizer(self) -> GenericTokenizer:
-        if not os.path.exists(TOKENIZER_PATH):
-            logger.info("RWKV tokenizer not found, downloading...")
-
-            r = requests.get(TOKENIZER_URL)
-            with open(TOKENIZER_PATH, "wb") as file:
-                file.write(r.content)
-
-        return GenericTokenizer(Tokenizer.from_file(TOKENIZER_PATH))
-
-    def _download_model(self, model_path: str, model_class: str) -> None:
-        logger.info(f"{self.model_name} not found, downloading...")
-        url = hf_hub_url(
-            repo_id=f"{REPO_OWNER}/{model_class}",
-            filename=MODEL_FILES[self.model_name],
-        )
-
-        # TODO: Use aria2
-        # https://stackoverflow.com/a/57030446
-        with requests.get(url, stream=True) as r:
-            r.raise_for_status()
-            bar = tqdm(
-                desc="Downloading RWKV Model",
-                unit="B",
-                unit_scale=True,
-                total=int(r.headers["Content-Length"]),
-            )
-            with open(model_path, "wb") as file:
-                for chunk in r.iter_content(chunk_size=8192):
-                    if not chunk:
-                        continue
-
-                    file.write(chunk)
-                    bar.update(len(chunk))
-
-    def _load(self, save_model: bool, initial_load: bool) -> None:
-        self._ensure_directory_structure()
-        self.tokenizer = self._get_tokenizer()
-
-        # Parse model name
-        model_class, _, special = self.model_name.partition(":")
-        special = special or None
-
-        model_dir = os.path.join("models", "rwkv", "models", model_class)
-        if not os.path.exists(model_dir):
-            os.mkdir(model_dir)
-
-        # Download model if we need to
-        model_path = os.path.join(model_dir, MODEL_FILES[self.model_name])
-        if not os.path.exists(model_path):
-            self._download_model(model_path, model_class)
-
-        # Now we load!
-        # TODO: Breakmodel to strat
-        from rwkv.model import RWKV
-        self.model = RWKV(model=model_path, strategy="cuda:0 fp16")
-
-    def _apply_warpers(
-        self, scores: torch.Tensor, input_ids: torch.Tensor
-    ) -> torch.Tensor:
-        warpers.update_settings()
-
-        for sid in utils.koboldai_vars.sampler_order:
-            warper = Warper.from_id(sid)
-
-            if not warper.value_is_valid():
-                continue
-
-            if warper == warpers.RepetitionPenalty:
-                # Rep pen needs more data than other samplers
-                scores = warper.torch(scores, input_ids=input_ids)
-            else:
-                scores = warper.torch(scores)
-        return scores
-
-    def _sample_token(self, logits: torch.Tensor, input_ids: torch.Tensor) -> int:
-        probs = F.softmax(logits.float(), dim=-1)
-
-        if probs.device == torch.device("cpu"):
-            probs = probs.numpy()
-            sorted_ids = np.argsort(probs)
-            sorted_probs = probs[sorted_ids][::-1]
-
-            probs = self._apply_warpers(probs[None, :], input_ids)
-
-            # TODO: is this right?
-            probs[probs == -torch.inf] = 0.0
-
-            probs = probs / np.sum(probs)
-            out = np.random.choice(a=len(probs), p=probs)
-            return int(out)
-        else:
-            sorted_ids = torch.argsort(probs)
-            sorted_probs = probs[sorted_ids]
-            sorted_probs = torch.flip(sorted_probs, dims=(0,))
-
-            probs = self._apply_warpers(probs[None, :], input_ids)
-
-            # TODO: is this right?
-            probs[probs == -torch.inf] = 0.0
-
-            out = torch.multinomial(probs, num_samples=1)[0]
-            return int(out)
-
-    def _raw_generate(
-        self,
-        prompt_tokens: Union[List[int], torch.Tensor],
-        max_new: int,
-        gen_settings: GenerationSettings,
-        single_line: bool = False,
-        batch_count: int = 1,
-        seed: Optional[int] = None,
-        **kwargs,
-    ) -> GenerationResult:
-        if seed is not None:
-            torch.manual_seed(seed)
-
-        aux_device = utils.get_auxilary_device()
-        context = torch.tensor(prompt_tokens)[None, :].to(aux_device)
-        out = []
-
-        start_time = time.time()
-        with torch.no_grad():
-            logits, state = self.model.forward(prompt_tokens, None)
-            last_token = prompt_tokens[-1]
-
-            for _ in range(max_new):
-                logits, state = self.model.forward([last_token], state)
-                last_token = self._sample_token(logits, context)
-                out.append(last_token)
-                add = torch.tensor([[last_token]]).to(aux_device)
-                context = torch.cat((context, add), dim=-1)
-                self._post_token_gen(context)
-
-        logger.debug(
-            "torch_raw_generate: run generator {}s".format(time.time() - start_time)
-        )
-
-        return GenerationResult(
-            self,
-            out_batches=torch.tensor([out]),
-            prompt=prompt_tokens,
-            is_whole_generation=False,
-            output_includes_prompt=True,
-        )