Moved model backends to separate folders

Added save/load for some model backend settings
ebolam
2023-05-18 20:14:33 -04:00
parent 4040538d34
commit 06f59a7b7b
14 changed files with 69 additions and 409 deletions

View File

@@ -626,9 +626,9 @@ import importlib
model_backend_code = {}
model_backends = {}
for module in os.listdir("./modeling/inference_models"):
if os.path.isfile(os.path.join("./modeling/inference_models",module)) and module[-3:] == '.py':
model_backend_code[module[:-3]] = importlib.import_module('modeling.inference_models.{}'.format(module[:-3]))
model_backends[model_backend_code[module[:-3]].model_backend_name] = model_backend_code[module[:-3]].model_backend()
if not os.path.isfile(os.path.join("./modeling/inference_models",module)) and module != '__pycache__':
model_backend_code[module] = importlib.import_module('modeling.inference_models.{}.class'.format(module))
model_backends[model_backend_code[module].model_backend_name] = model_backend_code[module].model_backend()
old_socketio_on = socketio.on
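
The hunk above swaps flat `<backend>.py` files for per-backend folders: every directory under modeling/inference_models (except __pycache__) is imported as modeling.inference_models.<folder>.class and registered under its model_backend_name. A minimal standalone sketch of the same pattern (the helper name discover_backends is illustrative, not part of the commit):

import importlib
import os

def discover_backends():
    """Map each backend's display name to a fresh backend instance."""
    root = "./modeling/inference_models"
    backends = {}
    for entry in os.listdir(root):
        # Folders are backends now; loose files and __pycache__ are skipped.
        if os.path.isfile(os.path.join(root, entry)) or entry == "__pycache__":
            continue
        # 'class' is a Python keyword, but import_module() takes a plain
        # string, so <folder>/class.py imports without issue.
        module = importlib.import_module("modeling.inference_models.{}.class".format(entry))
        backends[module.model_backend_name] = module.model_backend()
    return backends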

View File

@@ -188,6 +188,7 @@ class InferenceModel:
self._pre_load()
self._load(save_model=save_model, initial_load=initial_load)
self._post_load()
self._save_settings()
def unload(self):
return
@@ -198,6 +199,9 @@ class InferenceModel:
def _post_load(self) -> None:
"""Post load hook. Called after `_load()`."""
def _save_settings(self) -> None:
"""Save settings hook. Called after `_post_load()`."""
def _load(self, save_model: bool, initial_load: bool) -> None:
"""Main load method. All logic related to loading the model onto the
selected device(s) and preparing it for inference should be implemented here."""
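
With this change the load sequence becomes `_pre_load()`, `_load()`, `_post_load()`, and now `_save_settings()`, with a docstring-only default so existing backends are unaffected. A hedged sketch of a backend using the new hook (the class name and settings path are illustrative, not from this commit):

import json

from modeling.inference_model import InferenceModel

class SketchBackend(InferenceModel):
    def __init__(self) -> None:
        super().__init__()
        self.base_url = ""

    def _load(self, save_model: bool, initial_load: bool) -> None:
        # A real backend would build its tokenizer/model here.
        pass

    def _save_settings(self) -> None:
        # Runs automatically at the end of load(); persist anything the
        # backend should restore on the next launch.
        with open("settings/sketch.model_backend.settings", "w") as f:
            json.dump({"base_url": self.base_url}, f)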

View File

@@ -26,19 +26,22 @@ class APIException(Exception):
class model_backend(InferenceModel):
def __init__(self) -> None:
super().__init__()
#self.base_url = ""
self.base_url = ""
def is_valid(self, model_name, model_path, menu_path):
return model_name == "API"
def get_requested_parameters(self, model_name, model_path, menu_path):
if os.path.exists("settings/api.model_backend.settings") and 'base_url' not in vars(self):
with open("settings/api.model_backend.settings", "r") as f:
self.base_url = json.load(f)['base_url']
requested_parameters = []
requested_parameters.append({
"uitype": "text",
"unit": "text",
"label": "URL",
"id": "base_url",
"default": False,
"default": self.base_url,
"check": {"value": "", 'check': "!="},
"tooltip": "The URL of the KoboldAI API to connect to.",
"menu_path": "",
@@ -58,6 +61,10 @@ class model_backend(InferenceModel):
# Do not allow API to be served over the API
self.capabilties = ModelCapabilities(api_host=False)
def _save_settings(self):
with open("settings/api.model_backend.settings", "w") as f:
json.dump({"base_url": self.base_url}, f, indent="")
def _raw_generate(
self,
prompt_tokens: Union[List[int], torch.Tensor],
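
The file written by `_save_settings()` above is plain JSON (e.g. {"base_url": "https://..."}), and `get_requested_parameters()` reads it back only when base_url has not been set yet, so the saved URL becomes the form default. A slightly more defensive read, shown purely as an illustration (the .get() fallback and helper name are additions, not in the commit):

import json
import os

def load_backend_setting(path, key, default=""):
    """Return one value from a <backend>.model_backend.settings JSON file."""
    if not os.path.exists(path):
        return default
    with open(path, "r") as f:
        return json.load(f).get(key, default)

# Example: base_url = load_backend_setting("settings/api.model_backend.settings", "base_url")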

View File

@@ -24,6 +24,7 @@ class BasicAPIException(Exception):
class model_backend(InferenceModel):
def __init__(self) -> None:
super().__init__()
self.colaburl = ""
# Do not allow API to be served over the API
self.capabilties = ModelCapabilities(api_host=False)
@@ -32,13 +33,16 @@ class model_backend(InferenceModel):
return model_name == "Colab"
def get_requested_parameters(self, model_name, model_path, menu_path):
if os.path.exists("settings/api.model_backend.settings") and 'colaburl' not in vars(self):
with open("settings/api.model_backend.settings", "r") as f:
self.colaburl = json.load(f)['base_url']
requested_parameters = []
requested_parameters.append({
"uitype": "text",
"unit": "text",
"label": "URL",
"id": "colaburl",
"default": False,
"default": self.colaburl,
"check": {"value": "", 'check': "!="},
"tooltip": "The URL of the Colab KoboldAI API to connect to.",
"menu_path": "",
@@ -56,6 +60,10 @@ class model_backend(InferenceModel):
def _load(self, save_model: bool, initial_load: bool) -> None:
self.tokenizer = self._get_tokenizer("EleutherAI/gpt-neo-2.7B")
def _save_settings(self):
with open("settings/basic_api.model_backend.settings", "w") as f:
json.dump({"colaburl": self.colaburl}, f, indent="")
def _raw_generate(
self,
prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -20,7 +20,7 @@ except ModuleNotFoundError as e:
if not utils.koboldai_vars.use_colab_tpu:
raise e
from modeling.inference_models.parents.hf_torch import HFTorchInferenceModel
from modeling.inference_models.hf_torch import HFTorchInferenceModel
model_backend_name = "Huggingface"
@@ -270,3 +270,7 @@ class model_backend(HFTorchInferenceModel):
self.model.kai_model = self
utils.koboldai_vars.modeldim = self.get_hidden_size()
def _save_settings(self):
with open("settings/{}.generic_hf_torch.model_backend.settings".format(self.model_name.replace("/", "_")), "w") as f:
json.dump({"layers": self.layers if 'layers' in vars(self) else [], "disk_layers": self.disk_layers if 'disk_layers' in vars(self) else 0}, f, indent="")

View File

@@ -11,14 +11,14 @@ from modeling.inference_model import (
InferenceModel,
)
from modeling.inference_models.parents.openai_gooseai import model_backend as openai_gooseai_model_backend
from modeling.inference_models.openai_gooseai import model_backend as openai_gooseai_model_backend
model_backend_name = "GooseAI"
class OpenAIAPIError(Exception):
def __init__(self, error_type: str, error_message) -> None:
super().__init__(f"{error_type}: {error_message}")
self.source = "GooseAI"
class model_backend(openai_gooseai_model_backend):

View File

@@ -3,6 +3,7 @@ from typing import Optional
from transformers import AutoConfig
import warnings
import utils
import json
import koboldai_settings
from logger import logger
from modeling.inference_model import InferenceModel
@@ -44,16 +45,15 @@ class HFInferenceModel(InferenceModel):
self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
if os.path.exists("settings/{}.breakmodel".format(model_name.replace("/", "_"))):
with open("settings/{}.breakmodel".format(model_name.replace("/", "_")), "r") as file:
data = [x for x in file.read().split("\n")[:2] if x != '']
if len(data) < 2:
data.append("0")
break_values, disk_blocks = data
break_values = break_values.split(",")
if os.path.exists("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
with open("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
temp = json.load(f)
break_values = temp['layers'] if 'layers' in temp else [layer_count]
disk_blocks = temp['disk_layers'] if 'disk_layers' in temp else 0
else:
break_values = [layer_count]
disk_blocks = 0
break_values = [int(x) for x in break_values if x != '' and x is not None]
gpu_count = torch.cuda.device_count()
break_values += [0] * (gpu_count - len(break_values))
@@ -132,8 +132,15 @@ class HFInferenceModel(InferenceModel):
if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
gpu_count = torch.cuda.device_count()
layers = []
logger.info(parameters)
for i in range(gpu_count):
layers.append(int(parameters["{}_Layers".format(i)]) if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric() else None)
logger.info(parameters["{}_Layers".format(i)])
if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric():
layers.append(int(parameters["{}_Layers".format(i)]))
elif isinstance(parameters["{}_Layers".format(i)], str):
layers.append(None)
else:
layers.append(parameters["{}_Layers".format(i)])
self.cpu_layers = parameters['CPU_Layers'] if 'CPU_Layers' in parameters else None
if isinstance(self.cpu_layers, str):
self.cpu_layers = int(self.cpu_layers) if self.cpu_layers.isnumeric() else 0
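
The reworked loop above distinguishes three cases for each "{i}_Layers" value coming from the UI: a numeric string becomes an int, any other string becomes None, and non-string values pass through unchanged; CPU_Layers gets the same numeric-string treatment. The same rule as a small helper, purely illustrative:

def parse_layer_value(value):
    """Numeric strings -> int, other strings -> None, anything else unchanged."""
    if isinstance(value, str):
        return int(value) if value.isnumeric() else None
    return value

# parse_layer_value("16") -> 16, parse_layer_value("") -> None, parse_layer_value(4) -> 4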

View File

@@ -16,7 +16,7 @@ from modeling.inference_model import (
GenerationSettings,
ModelCapabilities,
)
from modeling.inference_models.parents.hf import HFInferenceModel
from modeling.inference_models.hf import HFInferenceModel
from modeling.tokenizer import GenericTokenizer
model_backend_name = "Huggingface MTJ"

View File

@@ -31,7 +31,7 @@ from modeling import warpers
from modeling.warpers import Warper
from modeling.stoppers import Stoppers
from modeling.post_token_hooks import PostTokenHooks
from modeling.inference_models.parents.hf import HFInferenceModel
from modeling.inference_models.hf import HFInferenceModel
from modeling.inference_model import (
GenerationResult,
GenerationSettings,
@@ -823,135 +823,10 @@ class HFTorchInferenceModel(HFInferenceModel):
breakmodel.gpu_blocks = [0] * n_layers
return
elif (
utils.args.breakmodel_gpulayers is not None
or utils.args.breakmodel_disklayers is not None
or breakmodel.gpu_blocks != []
):
try:
if breakmodel.gpu_blocks == []:
if utils.args.breakmodel_gpulayers:
breakmodel.gpu_blocks = list(
map(int, utils.args.breakmodel_gpulayers.split(","))
)
assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count()
s = n_layers
for i in range(len(breakmodel.gpu_blocks)):
if breakmodel.gpu_blocks[i] <= -1:
breakmodel.gpu_blocks[i] = s
break
else:
s -= breakmodel.gpu_blocks[i]
assert sum(breakmodel.gpu_blocks) <= n_layers
n_layers -= sum(breakmodel.gpu_blocks)
n_layers -= breakmodel.disk_blocks
except:
logger.warning(
"--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0."
)
breakmodel.gpu_blocks = [n_layers]
n_layers = 0
elif utils.args.breakmodel_layers is not None:
breakmodel.gpu_blocks = [
n_layers - max(0, min(n_layers, utils.args.breakmodel_layers))
]
n_layers -= sum(breakmodel.gpu_blocks)
elif utils.args.model is not None:
elif breakmodel.gpu_blocks != []:
logger.info("Breakmodel not specified, assuming GPU 0")
breakmodel.gpu_blocks = [n_layers]
n_layers = 0
else:
device_count = torch.cuda.device_count()
if device_count > 1:
print(
Colors.CYAN
+ "\nPlease select one of your GPUs to be your primary GPU."
)
print(
"VRAM usage in your primary GPU will be higher than for your other ones."
)
print("It is recommended you make your fastest GPU your primary GPU.")
self.breakmodel_device_list(n_layers)
while True:
primaryselect = input("device ID> ")
if (
primaryselect.isnumeric()
and 0 <= int(primaryselect) < device_count
):
breakmodel.primary_device = int(primaryselect)
break
else:
print(
f"{Colors.RED}Please enter an integer between 0 and {device_count-1}.{Colors.END}"
)
else:
breakmodel.primary_device = 0
print(
Colors.PURPLE
+ "\nIf you don't have enough VRAM to run the model on a single GPU"
)
print(
"you can split the model between your CPU and your GPU(s), or between"
)
print("multiple GPUs if you have more than one.")
print("By putting more 'layers' on a GPU or CPU, more computations will be")
print(
"done on that device and more VRAM or RAM will be required on that device"
)
print("(roughly proportional to number of layers).")
print(
"It should be noted that GPUs are orders of magnitude faster than the CPU."
)
print(
f"This model has{Colors.YELLOW} {n_layers} {Colors.PURPLE}layers.{Colors.END}\n"
)
for i in range(device_count):
self.breakmodel_device_list(
n_layers, primary=breakmodel.primary_device, selected=i
)
print(
f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
)
while True:
layerselect = input("# of layers> ")
if (
layerselect.isnumeric() or layerselect.strip() == "-1"
) and -1 <= int(layerselect) <= n_layers:
layerselect = int(layerselect)
layerselect = n_layers if layerselect == -1 else layerselect
breakmodel.gpu_blocks.append(layerselect)
n_layers -= layerselect
break
else:
print(
f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
)
if n_layers == 0:
break
if n_layers > 0:
self.breakmodel_device_list(
n_layers, primary=breakmodel.primary_device, selected=-1
)
print(
f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
)
while True:
layerselect = input("# of layers> ")
if (
layerselect.isnumeric() or layerselect.strip() == "-1"
) and -1 <= int(layerselect) <= n_layers:
layerselect = int(layerselect)
layerselect = n_layers if layerselect == -1 else layerselect
breakmodel.disk_blocks = layerselect
n_layers -= layerselect
break
else:
print(
f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
)
logger.init_ok("Final device configuration:", status="Info")
self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)
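
Most of what this hunk removes is the interactive breakmodel setup, where a comma-separated layer spec (from the console prompts or --breakmodel_gpulayers) was expanded with -1 meaning "take all remaining layers". For readers following what the removed block did, a condensed sketch of that expansion rule (the helper name is mine, not from the code):

def parse_gpu_blocks(spec, n_layers):
    """Turn a string like "20,-1" into per-GPU layer counts; -1 takes the rest."""
    blocks = list(map(int, spec.split(",")))
    remaining = n_layers
    for i, count in enumerate(blocks):
        if count <= -1:
            blocks[i] = remaining
            break
        remaining -= count
    assert sum(blocks) <= n_layers
    return blocks

# parse_gpu_blocks("20,-1", 32) -> [20, 12]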

View File

@@ -38,6 +38,11 @@ class model_backend(InferenceModel):
return model_name == "CLUSTER" or model_name in [x['value'] for x in self.models]
def get_requested_parameters(self, model_name, model_path, menu_path):
if os.path.exists("settings/api.model_backend.settings") and 'base_url' not in vars(self):
with open("settings/horde.model_backend.settings", "r") as f:
temp = json.load(f)
self.base_url = temp['url']
self.key = temp['key']
requested_parameters = []
requested_parameters.extend([{
"uitype": "text",
@@ -122,6 +127,10 @@ class model_backend(InferenceModel):
#else "gpt2",
)
def _save_settings(self):
with open("settings/horde.model_backend.settings", "w") as f:
json.dump({"key": self.key, "url": self.url}, f, indent="")
def _raw_generate(
self,
prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -11,13 +11,14 @@ from modeling.inference_model import (
InferenceModel,
)
from modeling.inference_models.parents.openai_gooseai import model_backend as openai_gooseai_model_backend
from modeling.inference_models.openai_gooseai import model_backend as openai_gooseai_model_backend
model_backend_name = "OpenAI"
class OpenAIAPIError(Exception):
def __init__(self, error_type: str, error_message) -> None:
super().__init__(f"{error_type}: {error_message}")
self.source = "OpenAI"
class model_backend(openai_gooseai_model_backend):

View File

@@ -25,15 +25,14 @@ class model_backend(InferenceModel):
super().__init__()
self.key = ""
self.url = "https://api.goose.ai/v1/engines"
#if self.source == 'OAI':
# url = "https://api.openai.com/v1/engines"
#elif self.source == 'GooseAI':
# url = "https://api.goose.ai/v1/engines"
def is_valid(self, model_name, model_path, menu_path):
return model_name == "OAI" or model_name == "GooseAI"
def get_requested_parameters(self, model_name, model_path, menu_path):
if os.path.exists("settings/{}.model_backend.settings".format(self.source)) and 'colaburl' not in vars(self):
with open("settings/{}.model_backend.settings".format(self.source), "r") as f:
self.key = json.load(f)['key']
self.source = model_name
requested_parameters = []
requested_parameters.extend([{
@@ -41,7 +40,7 @@ class model_backend(InferenceModel):
"unit": "text",
"label": "Key",
"id": "key",
"default": "",
"default": self.key,
"check": {"value": "", 'check': "!="},
"tooltip": "User Key to use when connecting to OpenAI/GooseAI.",
"menu_path": "",
@@ -106,6 +105,10 @@ class model_backend(InferenceModel):
def _load(self, save_model: bool, initial_load: bool) -> None:
self.tokenizer = self._get_tokenizer("gpt2")
def _save_settings(self):
with open("settings/{}.model_backend.settings".format(self.source), "w") as f:
json.dump({"key": self.key}, f, indent="")
def _raw_generate(
self,
prompt_tokens: Union[List[int], torch.Tensor],

View File

@@ -1,258 +0,0 @@
from __future__ import annotations
import os
import time
from typing import Dict, List, Optional, Union
import numpy as np
import requests
from tokenizers import Tokenizer
from tqdm import tqdm
from huggingface_hub import hf_hub_url
import torch
from torch.nn import functional as F
# Must be defined before import
os.environ["RWKV_JIT_ON"] = "1"
# TODO: Include compiled kernel
os.environ["RWKV_CUDA_ON"] = "1"
import utils
from logger import logger
from modeling import warpers
from modeling.warpers import Warper
from modeling.stoppers import Stoppers
from modeling.post_token_hooks import PostTokenHooks
from modeling.tokenizer import GenericTokenizer
from modeling.inference_model import (
GenerationResult,
GenerationSettings,
InferenceModel,
ModelCapabilities,
)
TOKENIZER_URL = (
"https://raw.githubusercontent.com/BlinkDL/ChatRWKV/main/20B_tokenizer.json"
)
TOKENIZER_PATH = "models/rwkv/20b_tokenizer.json"
REPO_OWNER = "BlinkDL"
MODEL_FILES = {
"rwkv-4-pile-14b": "RWKV-4-Pile-14B-20230213-8019.pth",
# NOTE: Still in progress(?)
"rwkv-4-pile-14b:ctx4096": "RWKV-4-Pile-14B-20230228-ctx4096-test663.pth",
"rwkv-4-pile-7b": "RWKV-4-Pile-7B-20221115-8047.pth",
"rwkv-4-pile-7b:ctx4096": "RWKV-4-Pile-7B-20230109-ctx4096.pth",
"rwkv-4-pile-3b": "RWKV-4-Pile-3B-20221008-8023.pth",
"rwkv-4-pile-3b:ctx4096": "RWKV-4-Pile-3B-20221110-ctx4096.pth",
"rwkv-4-pile-1b5": "RWKV-4-Pile-1B5-20220903-8040.pth",
"rwkv-4-pile-1b5:ctx4096": "RWKV-4-Pile-1B5-20220929-ctx4096.pth",
"rwkv-4-pile-430m": "RWKV-4-Pile-430M-20220808-8066.pth",
"rwkv-4-pile-169m": "RWKV-4-Pile-169M-20220807-8023.pth",
}
model_backend_name = "RWKV"
class model_backend(InferenceModel):
def __init__(
self,
#model_name: str,
) -> None:
super().__init__()
#self.model_name = model_name
self.post_token_hooks = [
PostTokenHooks.stream_tokens,
]
self.stopper_hooks = [
Stoppers.core_stopper,
Stoppers.dynamic_wi_scanner,
Stoppers.singleline_stopper,
Stoppers.chat_mode_stopper,
Stoppers.stop_sequence_stopper,
]
self.capabilties = ModelCapabilities(
embedding_manipulation=False,
post_token_hooks=True,
stopper_hooks=True,
post_token_probs=True,
)
self._old_stopping_criteria = None
def is_valid(self, model_name, model_path, menu_path):
try:
from rwkv.model import RWKV
valid = True
except:
valid = False
return valid and "rwkv" in model_name.lower()
def get_requested_parameters(self, model_name, model_path, menu_path):
self.source = model_name
requested_parameters = []
return requested_parameters
def set_input_parameters(self):
return
def _ensure_directory_structure(self) -> None:
for path in ["models/rwkv", "models/rwkv/models"]:
try:
os.mkdir(path)
except FileExistsError:
pass
def _get_tokenizer(self) -> GenericTokenizer:
if not os.path.exists(TOKENIZER_PATH):
logger.info("RWKV tokenizer not found, downloading...")
r = requests.get(TOKENIZER_URL)
with open(TOKENIZER_PATH, "wb") as file:
file.write(r.content)
return GenericTokenizer(Tokenizer.from_file(TOKENIZER_PATH))
def _download_model(self, model_path: str, model_class: str) -> None:
logger.info(f"{self.model_name} not found, downloading...")
url = hf_hub_url(
repo_id=f"{REPO_OWNER}/{model_class}",
filename=MODEL_FILES[self.model_name],
)
# TODO: Use aria2
# https://stackoverflow.com/a/57030446
with requests.get(url, stream=True) as r:
r.raise_for_status()
bar = tqdm(
desc="Downloading RWKV Model",
unit="B",
unit_scale=True,
total=int(r.headers["Content-Length"]),
)
with open(model_path, "wb") as file:
for chunk in r.iter_content(chunk_size=8192):
if not chunk:
continue
file.write(chunk)
bar.update(len(chunk))
def _load(self, save_model: bool, initial_load: bool) -> None:
self._ensure_directory_structure()
self.tokenizer = self._get_tokenizer()
# Parse model name
model_class, _, special = self.model_name.partition(":")
special = special or None
model_dir = os.path.join("models", "rwkv", "models", model_class)
if not os.path.exists(model_dir):
os.mkdir(model_dir)
# Download model if we need to
model_path = os.path.join(model_dir, MODEL_FILES[self.model_name])
if not os.path.exists(model_path):
self._download_model(model_path, model_class)
# Now we load!
# TODO: Breakmodel to strat
from rwkv.model import RWKV
self.model = RWKV(model=model_path, strategy="cuda:0 fp16")
def _apply_warpers(
self, scores: torch.Tensor, input_ids: torch.Tensor
) -> torch.Tensor:
warpers.update_settings()
for sid in utils.koboldai_vars.sampler_order:
warper = Warper.from_id(sid)
if not warper.value_is_valid():
continue
if warper == warpers.RepetitionPenalty:
# Rep pen needs more data than other samplers
scores = warper.torch(scores, input_ids=input_ids)
else:
scores = warper.torch(scores)
return scores
def _sample_token(self, logits: torch.Tensor, input_ids: torch.Tensor) -> int:
probs = F.softmax(logits.float(), dim=-1)
if probs.device == torch.device("cpu"):
probs = probs.numpy()
sorted_ids = np.argsort(probs)
sorted_probs = probs[sorted_ids][::-1]
probs = self._apply_warpers(probs[None, :], input_ids)
# TODO: is this right?
probs[probs == -torch.inf] = 0.0
probs = probs / np.sum(probs)
out = np.random.choice(a=len(probs), p=probs)
return int(out)
else:
sorted_ids = torch.argsort(probs)
sorted_probs = probs[sorted_ids]
sorted_probs = torch.flip(sorted_probs, dims=(0,))
probs = self._apply_warpers(probs[None, :], input_ids)
# TODO: is this right?
probs[probs == -torch.inf] = 0.0
out = torch.multinomial(probs, num_samples=1)[0]
return int(out)
def _raw_generate(
self,
prompt_tokens: Union[List[int], torch.Tensor],
max_new: int,
gen_settings: GenerationSettings,
single_line: bool = False,
batch_count: int = 1,
seed: Optional[int] = None,
**kwargs,
) -> GenerationResult:
if seed is not None:
torch.manual_seed(seed)
aux_device = utils.get_auxilary_device()
context = torch.tensor(prompt_tokens)[None, :].to(aux_device)
out = []
start_time = time.time()
with torch.no_grad():
logits, state = self.model.forward(prompt_tokens, None)
last_token = prompt_tokens[-1]
for _ in range(max_new):
logits, state = self.model.forward([last_token], state)
last_token = self._sample_token(logits, context)
out.append(last_token)
add = torch.tensor([[last_token]]).to(aux_device)
context = torch.cat((context, add), dim=-1)
self._post_token_gen(context)
logger.debug(
"torch_raw_generate: run generator {}s".format(time.time() - start_time)
)
return GenerationResult(
self,
out_batches=torch.tensor([out]),
prompt=prompt_tokens,
is_whole_generation=False,
output_includes_prompt=True,
)