Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-06-05 21:59:24 +02:00)

Commit: Merge ebolam's model-plugins branch
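Overview note: the diff below converges KoboldAI's individual model wrappers (KoboldAI API, the old Colab method, Hugging Face torch, the TPU/MTJ path, Horde, and OpenAI/GooseAI) onto a per-file model_backend plugin class. The following is a minimal sketch of the interface surface those classes appear to share; the method names and signatures are taken from the diff, while the standalone base class and the stub bodies are simplifications, not the real InferenceModel from modeling.inference_model.

# Minimal sketch of the model_backend plugin surface inferred from this diff.
# The real classes subclass modeling.inference_model.InferenceModel and also
# implement _raw_generate(); this stub only shows the configuration hooks.
from typing import List

model_backend_name = "Example backend"  # module-level display name, per the diff

class model_backend:
    def is_valid(self, model_name, model_path, menu_path) -> bool:
        # Return True if this backend can serve the selected menu entry.
        raise NotImplementedError

    def get_requested_parameters(self, model_name, model_path, menu_path, parameters={}) -> List[dict]:
        # Describe the UI inputs (URL, API key, layer sliders, ...) this backend needs.
        return []

    def set_input_parameters(self, parameters: dict) -> None:
        # Consume the values entered for the inputs described above.
        pass

    def _load(self, save_model: bool, initial_load: bool) -> None:
        # Load the tokenizer/model or connect to the remote service.
        pass

    def _save_settings(self) -> None:
        # Persist backend settings under settings/<name>.model_backend.settings.
        pass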
@@ -6,6 +6,7 @@ import torch
import requests
import numpy as np
from typing import List, Optional, Union
import os

import utils
from logger import logger
@@ -17,15 +18,42 @@ from modeling.inference_model import (
    ModelCapabilities,
)

model_backend_name = "KoboldAI API"

class APIException(Exception):
    """To be used for errors when using the Kobold API as an interface."""


class APIInferenceModel(InferenceModel):
    def __init__(self, base_url: str) -> None:
class model_backend(InferenceModel):
    def __init__(self) -> None:
        super().__init__()
        self.base_url = base_url.rstrip("/")
        self.base_url = ""
        self.model_name = "KoboldAI API"

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "API"

    def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
        if os.path.exists("settings/api.model_backend.settings") and 'base_url' not in vars(self):
            with open("settings/api.model_backend.settings", "r") as f:
                self.base_url = json.load(f)['base_url']
        requested_parameters = []
        requested_parameters.append({
            "uitype": "text",
            "unit": "text",
            "label": "URL",
            "id": "base_url",
            "default": self.base_url,
            "check": {"value": "", 'check': "!="},
            "tooltip": "The URL of the KoboldAI API to connect to.",
            "menu_path": "",
            "extra_classes": "",
            "refresh_model_inputs": False
        })
        return requested_parameters

    def set_input_parameters(self, parameters):
        self.base_url = parameters['base_url'].rstrip("/")

    def _load(self, save_model: bool, initial_load: bool) -> None:
        tokenizer_id = requests.get(f"{self.base_url}/api/v1/model").json()["result"]
@@ -35,6 +63,10 @@ class APIInferenceModel(InferenceModel):
        # Do not allow API to be served over the API
        self.capabilties = ModelCapabilities(api_host=False)

    def _save_settings(self):
        with open("settings/api.model_backend.settings", "w") as f:
            json.dump({"base_url": self.base_url}, f, indent="")

    def _raw_generate(
        self,
        prompt_tokens: Union[List[int], torch.Tensor],
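Note: every backend's get_requested_parameters returns a list of plain dicts that drive the model-loading UI; the keys used throughout this diff are uitype, unit, label, id, default, check, tooltip, menu_path, extra_classes and refresh_model_inputs. A small hedged illustration of one descriptor and how its value comes back keyed by id (the example URL is made up):

# Illustration only: one UI parameter descriptor in the shape used by this diff.
url_field = {
    "uitype": "text",              # widget type
    "unit": "text",                # value type
    "label": "URL",                # label shown in the loading UI
    "id": "base_url",              # key the UI uses when posting the value back
    "default": "",
    "check": {"value": "", "check": "!="},  # reject an empty value
    "tooltip": "The URL of the KoboldAI API to connect to.",
    "menu_path": "",
    "extra_classes": "",
    "refresh_model_inputs": False,
}

# The UI later posts values keyed by "id"; set_input_parameters reads them, e.g.:
parameters = {"base_url": "http://localhost:5000"}   # hypothetical user input
base_url = parameters[url_field["id"]].rstrip("/")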
@@ -4,6 +4,7 @@ import torch
import requests
import numpy as np
from typing import List, Optional, Union
import os

import utils
from logger import logger
@@ -15,19 +16,54 @@ from modeling.inference_model import (
)


model_backend_name = "KoboldAI Old Colab Method"

class BasicAPIException(Exception):
    """To be used for errors when using the Basic API as an interface."""


class BasicAPIInferenceModel(InferenceModel):
class model_backend(InferenceModel):
    def __init__(self) -> None:
        super().__init__()
        self.colaburl = ""

        # Do not allow API to be served over the API
        self.capabilties = ModelCapabilities(api_host=False)

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "Colab"

    def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
        if os.path.exists("settings/api.model_backend.settings") and 'colaburl' not in vars(self):
            with open("settings/api.model_backend.settings", "r") as f:
                self.colaburl = json.load(f)['base_url']
        requested_parameters = []
        requested_parameters.append({
            "uitype": "text",
            "unit": "text",
            "label": "URL",
            "id": "colaburl",
            "default": self.colaburl,
            "check": {"value": "", 'check': "!="},
            "tooltip": "The URL of the Colab KoboldAI API to connect to.",
            "menu_path": "",
            "extra_classes": "",
            "refresh_model_inputs": False
        })
        return requested_parameters

    def set_input_parameters(self, parameters):
        self.colaburl = parameters['colaburl']

    def _initialize_model(self):
        return

    def _load(self, save_model: bool, initial_load: bool) -> None:
        self.tokenizer = self._get_tokenizer("EleutherAI/gpt-neo-2.7B")

    def _save_settings(self):
        with open("settings/basic_api.model_backend.settings", "w") as f:
            json.dump({"colaburl": self.colaburl}, f, indent="")

    def _raw_generate(
        self,
@@ -68,7 +104,7 @@ class BasicAPIInferenceModel(InferenceModel):
        }

        # Create request
        req = requests.post(utils.koboldai_vars.colaburl, json=reqdata)
        req = requests.post(self.colaburl, json=reqdata)

        if req.status_code != 200:
            raise BasicAPIException(f"Bad status code {req.status_code}")
@@ -23,6 +23,7 @@ except ModuleNotFoundError as e:

from modeling.inference_models.hf_torch import HFTorchInferenceModel

model_backend_name = "Huggingface"

class GenericHFTorchInferenceModel(HFTorchInferenceModel):
    def load_config(self) -> None:
@@ -37,9 +38,9 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):

        if self.model_name == "NeoCustom":
            self.model_name = os.path.basename(
                os.path.normpath(utils.koboldai_vars.custmodpth)
                os.path.normpath(self.path)
            )
            utils.koboldai_vars.model = self.model_name
        utils.koboldai_vars.model = self.model_name

        # If we specify a model and it's in the root directory, we need to move
        # it to the models directory (legacy folder structure to new)
@@ -51,14 +52,11 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):

        self.init_model_config()

    def _load(self, save_model: bool, initial_load: bool) -> None:
        self.load_config()

        tf_kwargs = {
            "low_cpu_mem_usage": True,
        }

        if utils.koboldai_vars.model_type == "gpt2":
        if self.model_type == "gpt2":
            # We must disable low_cpu_mem_usage and if using a GPT-2 model
            # because GPT-2 is not compatible with this feature yet.
            tf_kwargs.pop("low_cpu_mem_usage", None)
@@ -68,12 +66,14 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):

        # If we're using torch_lazy_loader, we need to get breakmodel config
        # early so that it knows where to load the individual model tensors
        logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel))
        if (
            self.lazy_load
            and utils.koboldai_vars.hascuda
            and utils.koboldai_vars.breakmodel
            and not utils.koboldai_vars.nobreakmodel
            and self.breakmodel
            and not self.nobreakmodel
        ):
            logger.debug("loading breakmodel")
            self.breakmodel_device_config(self.model_config)

        if self.lazy_load:
@@ -250,11 +250,12 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):

        self.patch_embedding()


        if utils.koboldai_vars.hascuda:
            if utils.koboldai_vars.usegpu:
            if self.usegpu:
                # Use just VRAM
                self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
            elif utils.koboldai_vars.breakmodel:
            elif self.breakmodel:
                # Use both RAM and VRAM (breakmodel)
                if not self.lazy_load:
                    self.breakmodel_device_config(self.model.config)
@@ -269,6 +270,11 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
                self._move_to_devices()
        else:
            self.model = self.model.to("cpu").float()



        self.model.kai_model = self
        utils.koboldai_vars.modeldim = self.get_hidden_size()

    def _save_settings(self):
        with open("settings/{}.generic_hf_torch.model_backend.settings".format(self.model_name.replace("/", "_")), "w") as f:
            json.dump({"layers": self.layers if 'layers' in vars(self) else [], "disk_layers": self.disk_layers if 'disk_layers' in vars(self) else 0}, f, indent="")
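Note: each backend persists its configuration as a small JSON file under settings/ (written by _save_settings above and read back at the top of get_requested_parameters). A minimal sketch of that round-trip, using the same file-name pattern; the model name and layer split below are placeholder values:

import json
import os

# Placeholder model name; the real file name is derived from self.model_name.
settings_file = "settings/EleutherAI_gpt-neo-2.7B.generic_hf_torch.model_backend.settings"

# _save_settings: store the layer split chosen in the UI.
os.makedirs("settings", exist_ok=True)
with open(settings_file, "w") as f:
    json.dump({"layers": [24, 8], "disk_layers": 0}, f, indent="")

# get_requested_parameters: reuse the stored split as slider defaults.
if os.path.exists(settings_file):
    with open(settings_file) as f:
        saved = json.load(f)
    break_values = saved.get("layers", [])
    disk_blocks = saved.get("disk_layers", 0)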
modeling/inference_models/gooseai/class.py (33 lines, normal file)
@@ -0,0 +1,33 @@
import torch
import requests
import numpy as np
from typing import List, Optional, Union
import os

import utils
from logger import logger
from modeling.inference_model import (
    GenerationResult,
    GenerationSettings,
    InferenceModel,
)

from modeling.inference_models.openai_gooseai import model_backend as openai_gooseai_model_backend

model_backend_name = "GooseAI"

class OpenAIAPIError(Exception):
    def __init__(self, error_type: str, error_message) -> None:
        super().__init__(f"{error_type}: {error_message}")


class model_backend(openai_gooseai_model_backend):
    """InferenceModel for interfacing with OpenAI's generation API."""

    def __init__(self):
        super().__init__()
        self.url = "https://api.goose.ai/v1/engines"
        self.source = "GooseAI"

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "GooseAI"
@@ -1,25 +1,230 @@
|
||||
import os
|
||||
import os, sys
|
||||
from typing import Optional
|
||||
<<<<<<< HEAD
|
||||
from hf_bleeding_edge import AutoConfig
|
||||
|
||||
=======
|
||||
from transformers import AutoConfig
|
||||
import warnings
|
||||
>>>>>>> ebolam/Model_Plugins
|
||||
import utils
|
||||
import json
|
||||
import koboldai_settings
|
||||
from logger import logger
|
||||
from modeling.inference_model import InferenceModel
|
||||
import torch
|
||||
import gc
|
||||
|
||||
|
||||
class HFInferenceModel(InferenceModel):
|
||||
def __init__(self, model_name: str) -> None:
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.model_config = None
|
||||
self.model_name = model_name
|
||||
#self.model_name = model_name
|
||||
|
||||
self.model = None
|
||||
self.tokenizer = None
|
||||
self.badwordsids = koboldai_settings.badwordsids_default
|
||||
self.usegpu = False
|
||||
|
||||
def is_valid(self, model_name, model_path, menu_path):
|
||||
try:
|
||||
if model_path is not None and os.path.exists(model_path):
|
||||
self.model_config = AutoConfig.from_pretrained(model_path)
|
||||
elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))):
|
||||
self.model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache")
|
||||
else:
|
||||
self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
|
||||
requested_parameters = []
|
||||
if not self.hf_torch:
|
||||
return []
|
||||
if model_name == 'customhuggingface':
|
||||
requested_parameters.append({
|
||||
"uitype": "text",
|
||||
"unit": "text",
|
||||
"label": "Huggingface Model Name",
|
||||
"id": "custom_model_name",
|
||||
"default": parameters["custom_model_name"] if "custom_model_name" in parameters and parameters["custom_model_name"] != "" else "",
|
||||
"check": {"value": "", 'check': "!="},
|
||||
"tooltip": "Model name from https://huggingface.co/",
|
||||
"menu_path": "",
|
||||
"refresh_model_inputs": True,
|
||||
"extra_classes": ""
|
||||
})
|
||||
|
||||
if model_name != 'customhuggingface' or "custom_model_name" in parameters:
|
||||
model_name = parameters["custom_model_name"] if "custom_model_name" in parameters and parameters["custom_model_name"] != "" else model_name
|
||||
if model_path is not None and os.path.exists(model_path):
|
||||
self.model_config = AutoConfig.from_pretrained(model_path)
|
||||
elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))):
|
||||
self.model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache")
|
||||
else:
|
||||
self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
|
||||
layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
|
||||
layer_count = None if hasattr(self, "get_model_type") and self.get_model_type() == "gpt2" else layer_count #Skip layers if we're a GPT2 model as it doesn't support breakmodel
|
||||
if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
|
||||
if os.path.exists("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
|
||||
with open("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
|
||||
temp = json.load(f)
|
||||
break_values = temp['layers'] if 'layers' in temp else [layer_count]
|
||||
disk_blocks = temp['disk_layers'] if 'disk_layers' in temp else 0
|
||||
else:
|
||||
break_values = [layer_count]
|
||||
disk_blocks = 0
|
||||
|
||||
break_values = [int(x) for x in break_values if x != '' and x is not None]
|
||||
gpu_count = torch.cuda.device_count()
|
||||
break_values += [0] * (gpu_count - len(break_values))
|
||||
if disk_blocks is not None:
|
||||
break_values += [int(disk_blocks)]
|
||||
requested_parameters.append({
|
||||
"uitype": "Valid Display",
|
||||
"unit": "text",
|
||||
"label": "Current Allocated Layers: %1/{}".format(layer_count), #%1 will be the validation value
|
||||
"id": "valid_layers",
|
||||
"max": layer_count,
|
||||
"step": 1,
|
||||
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
|
||||
"menu_path": "Layers",
|
||||
"extra_classes": "",
|
||||
"refresh_model_inputs": False
|
||||
})
|
||||
for i in range(gpu_count):
|
||||
requested_parameters.append({
|
||||
"uitype": "slider",
|
||||
"unit": "int",
|
||||
"label": "{} Layers".format(torch.cuda.get_device_name(i)),
|
||||
"id": "{}_Layers".format(i),
|
||||
"min": 0,
|
||||
"max": layer_count,
|
||||
"step": 1,
|
||||
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
|
||||
"check_message": "The sum of assigned layers must equal {}".format(layer_count),
|
||||
"default": break_values[i],
|
||||
"tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)),
|
||||
"menu_path": "Layers",
|
||||
"extra_classes": "",
|
||||
"refresh_model_inputs": False
|
||||
})
|
||||
requested_parameters.append({
|
||||
"uitype": "slider",
|
||||
"unit": "int",
|
||||
"label": "CPU Layers",
|
||||
"id": "CPU_Layers",
|
||||
"min": 0,
|
||||
"max": layer_count,
|
||||
"step": 1,
|
||||
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
|
||||
"check_message": "The sum of assigned layers must equal {}".format(layer_count),
|
||||
"default": layer_count - sum(break_values),
|
||||
"tooltip": "The number of layers to put on the CPU. This will use your system RAM. It will also do inference partially on CPU. Use if you must.",
|
||||
"menu_path": "Layers",
|
||||
"extra_classes": "",
|
||||
"refresh_model_inputs": False
|
||||
})
|
||||
if disk_blocks is not None:
|
||||
requested_parameters.append({
|
||||
"uitype": "slider",
|
||||
"unit": "int",
|
||||
"label": "Disk Layers",
|
||||
"id": "Disk_Layers",
|
||||
"min": 0,
|
||||
"max": layer_count,
|
||||
"step": 1,
|
||||
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
|
||||
"check_message": "The sum of assigned layers must equal {}".format(layer_count),
|
||||
"default": disk_blocks,
|
||||
"tooltip": "The number of layers to put on the disk. This will use your hard drive. The is VERY slow in comparison to GPU or CPU. Use as a last resort.",
|
||||
"menu_path": "Layers",
|
||||
"extra_classes": "",
|
||||
"refresh_model_inputs": False
|
||||
})
|
||||
else:
|
||||
requested_parameters.append({
|
||||
"uitype": "toggle",
|
||||
"unit": "bool",
|
||||
"label": "Use GPU",
|
||||
"id": "use_gpu",
|
||||
"default": True,
|
||||
"tooltip": "Whether or not to use the GPU",
|
||||
"menu_path": "Layers",
|
||||
"extra_classes": "",
|
||||
"refresh_model_inputs": False
|
||||
})
|
||||
|
||||
|
||||
return requested_parameters
|
||||
|
||||
def set_input_parameters(self, parameters):
|
||||
if self.hf_torch and hasattr(self, "get_model_type") and self.get_model_type() != "gpt2":
|
||||
import breakmodel
|
||||
layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
|
||||
if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
|
||||
gpu_count = torch.cuda.device_count()
|
||||
layers = []
|
||||
for i in range(gpu_count):
|
||||
if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric():
|
||||
layers.append(int(parameters["{}_Layers".format(i)]))
|
||||
elif isinstance(parameters["{}_Layers".format(i)], str):
|
||||
layers.append(None)
|
||||
else:
|
||||
layers.append(parameters["{}_Layers".format(i)])
|
||||
self.cpu_layers = int(parameters['CPU_Layers']) if 'CPU_Layers' in parameters else None
|
||||
if isinstance(self.cpu_layers, str):
|
||||
self.cpu_layers = int(self.cpu_layers) if self.cpu_layers.isnumeric() else 0
|
||||
self.layers = layers
|
||||
self.disk_layers = parameters['Disk_Layers'] if 'Disk_Layers' in parameters else 0
|
||||
if isinstance(self.disk_layers, str):
|
||||
self.disk_layers = int(self.disk_layers) if self.disk_layers.isnumeric() else 0
|
||||
breakmodel.gpu_blocks = layers
|
||||
breakmodel.disk_blocks = self.disk_layers
|
||||
self.usegpu = self.cpu_layers == 0 and breakmodel.disk_blocks == 0 and sum(self.layers)-self.layers[0] == 0
|
||||
self.model_type = self.get_model_type()
|
||||
self.breakmodel = ((self.model_type != 'gpt2') or self.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not self.nobreakmodel
|
||||
self.lazy_load = True
|
||||
logger.debug("Model type: {}".format(self.model_type))
|
||||
else:
|
||||
logger.debug("Disabling breakmodel and lazyload")
|
||||
self.usegpu = parameters['use_gpu'] if 'use_gpu' in parameters else None
|
||||
self.breakmodel = False
|
||||
self.lazy_load = False
|
||||
logger.info(parameters)
|
||||
self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
|
||||
self.path = parameters['path'] if 'path' in parameters else None
|
||||
|
||||
def unload(self):
|
||||
if hasattr(self, 'model'):
|
||||
self.model = None
|
||||
if hasattr(self, 'tokenizer'):
|
||||
self.tokenizer = None
|
||||
if hasattr(self, 'model_config'):
|
||||
self.model_config = None
|
||||
with torch.no_grad():
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated")
|
||||
for tensor in gc.get_objects():
|
||||
try:
|
||||
if torch.is_tensor(tensor):
|
||||
tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype))
|
||||
except:
|
||||
pass
|
||||
gc.collect()
|
||||
try:
|
||||
with torch.no_grad():
|
||||
torch.cuda.empty_cache()
|
||||
except:
|
||||
pass
|
||||
|
||||
def _post_load(self) -> None:
|
||||
self.badwordsids = koboldai_settings.badwordsids_default
|
||||
self.model_type = str(self.model_config.model_type)
|
||||
# These are model specific tokenizer overrides if a model has bad defaults
|
||||
if utils.koboldai_vars.model_type == "llama":
|
||||
if self.model_type == "llama":
|
||||
# Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer
|
||||
self.tokenizer.add_bos_token = False
|
||||
|
||||
@@ -59,7 +264,7 @@ class HFInferenceModel(InferenceModel):
|
||||
token_ids = [first]
|
||||
elif len(token_ids) > 0:
|
||||
first = int(token_ids[0])
|
||||
elif token_ids:
|
||||
elif token_ids is not None and len(token_ids) > 0:
|
||||
first = token_ids[0]
|
||||
result = original_decode(self, token_ids, *args, **kwargs)
|
||||
if first is not None and first in has_prefix_space:
|
||||
@@ -103,32 +308,32 @@ class HFInferenceModel(InferenceModel):
|
||||
return result
|
||||
object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer))
|
||||
|
||||
elif utils.koboldai_vars.model_type == "opt":
|
||||
elif self.model_type == "opt":
|
||||
self.tokenizer._koboldai_header = self.tokenizer.encode("")
|
||||
self.tokenizer.add_bos_token = False
|
||||
self.tokenizer.add_prefix_space = False
|
||||
|
||||
# Change newline behavior to match model quirks
|
||||
if utils.koboldai_vars.model_type == "xglm":
|
||||
if self.model_type == "xglm":
|
||||
# Default to </s> newline mode if using XGLM
|
||||
utils.koboldai_vars.newlinemode = "s"
|
||||
elif utils.koboldai_vars.model_type in ["opt", "bloom"]:
|
||||
elif self.model_type in ["opt", "bloom"]:
|
||||
# Handle </s> but don't convert newlines if using Fairseq models that have newlines trained in them
|
||||
utils.koboldai_vars.newlinemode = "ns"
|
||||
|
||||
# Clean up tokens that cause issues
|
||||
if (
|
||||
utils.koboldai_vars.badwordsids == koboldai_settings.badwordsids_default
|
||||
and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj")
|
||||
self.badwordsids == koboldai_settings.badwordsids_default
|
||||
and self.model_type not in ("gpt2", "gpt_neo", "gptj")
|
||||
):
|
||||
utils.koboldai_vars.badwordsids = [
|
||||
self.badwordsids = [
|
||||
[v]
|
||||
for k, v in self.tokenizer.get_vocab().items()
|
||||
if any(c in str(k) for c in "[]")
|
||||
]
|
||||
|
||||
if utils.koboldai_vars.newlinemode == "n":
|
||||
utils.koboldai_vars.badwordsids.append([self.tokenizer.eos_token_id])
|
||||
self.badwordsids.append([self.tokenizer.eos_token_id])
|
||||
|
||||
return super()._post_load()
|
||||
|
||||
@@ -139,9 +344,12 @@ class HFInferenceModel(InferenceModel):
|
||||
Returns a string of the model's path locally, or None if it is not downloaded.
|
||||
If ignore_existance is true, it will always return a path.
|
||||
"""
|
||||
if self.path is not None:
|
||||
if os.path.exists(self.path):
|
||||
return self.path
|
||||
|
||||
if self.model_name in ["NeoCustom", "GPT2Custom", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]:
|
||||
model_path = utils.koboldai_vars.custmodpth
|
||||
model_path = self.path
|
||||
assert model_path
|
||||
|
||||
# Path can be absolute or relative to models directory
|
||||
@@ -158,7 +366,7 @@ class HFInferenceModel(InferenceModel):
|
||||
|
||||
return model_path
|
||||
|
||||
basename = utils.koboldai_vars.model.replace("/", "_")
|
||||
basename = self.model_name.replace("/", "_")
|
||||
if legacy:
|
||||
ret = basename
|
||||
else:
|
||||
@@ -176,24 +384,25 @@ class HFInferenceModel(InferenceModel):
|
||||
revision=utils.koboldai_vars.revision,
|
||||
cache_dir="cache",
|
||||
)
|
||||
utils.koboldai_vars.model_type = self.model_config.model_type
|
||||
|
||||
self.model_type = self.model_config.model_type
|
||||
|
||||
if "gptq_bits" in dir(self.model_config):
|
||||
utils.koboldai_vars.gptq_model = True
|
||||
utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits
|
||||
utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1
|
||||
utils.koboldai_vars.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1
|
||||
utils.koboldai_vars.gptq_file = None
|
||||
self.gptq_model = True
|
||||
self.gptq_bits = self.model_config.gptq_bits
|
||||
self.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1
|
||||
self.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1
|
||||
self.gptq_file = None
|
||||
else:
|
||||
utils.koboldai_vars.gptq_model = False
|
||||
self.gptq_model = False
|
||||
except ValueError:
|
||||
utils.koboldai_vars.model_type = {
|
||||
self.model_type = {
|
||||
"NeoCustom": "gpt_neo",
|
||||
"GPT2Custom": "gpt2",
|
||||
}.get(utils.koboldai_vars.model)
|
||||
}.get(self.model)
|
||||
|
||||
if not utils.koboldai_vars.model_type:
|
||||
if not self.model_type:
|
||||
logger.warning(
|
||||
"No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)"
|
||||
)
|
||||
utils.koboldai_vars.model_type = "gpt_neo"
|
||||
self.model_type = "gpt_neo"
|
||||
|
@@ -17,19 +17,18 @@ from modeling.inference_model import (
|
||||
ModelCapabilities,
|
||||
)
|
||||
from modeling.inference_models.hf import HFInferenceModel
|
||||
from modeling.tokenizer import GenericTokenizer
|
||||
|
||||
# This file shouldn't be imported unless using the TPU
|
||||
assert utils.koboldai_vars.use_colab_tpu
|
||||
import tpu_mtj_backend
|
||||
model_backend_name = "Huggingface MTJ"
|
||||
|
||||
|
||||
class HFMTJInferenceModel(HFInferenceModel):
|
||||
class model_backend(HFInferenceModel):
|
||||
def __init__(
|
||||
self,
|
||||
model_name: str,
|
||||
#model_name: str,
|
||||
) -> None:
|
||||
super().__init__(model_name)
|
||||
|
||||
super().__init__()
|
||||
self.hf_torch = False
|
||||
self.model_config = None
|
||||
self.capabilties = ModelCapabilities(
|
||||
embedding_manipulation=False,
|
||||
@@ -38,8 +37,13 @@ class HFMTJInferenceModel(HFInferenceModel):
|
||||
post_token_probs=False,
|
||||
uses_tpu=True,
|
||||
)
|
||||
|
||||
def is_valid(self, model_name, model_path, menu_path):
|
||||
# This file shouldn't be imported unless using the TPU
|
||||
return utils.koboldai_vars.use_colab_tpu and super().is_valid(model_name, model_path, menu_path)
|
||||
|
||||
def setup_mtj(self) -> None:
|
||||
import tpu_mtj_backend
|
||||
def mtj_warper_callback(scores) -> "np.array":
|
||||
scores_shape = scores.shape
|
||||
scores_list = scores.tolist()
|
||||
@@ -146,7 +150,7 @@ class HFMTJInferenceModel(HFInferenceModel):
|
||||
|
||||
tpu_mtj_backend.socketio = utils.socketio
|
||||
|
||||
if utils.koboldai_vars.model == "TPUMeshTransformerGPTNeoX":
|
||||
if self.model_name == "TPUMeshTransformerGPTNeoX":
|
||||
utils.koboldai_vars.badwordsids = utils.koboldai_vars.badwordsids_neox
|
||||
|
||||
print(
|
||||
@@ -154,7 +158,7 @@ class HFMTJInferenceModel(HFInferenceModel):
|
||||
Colors.PURPLE, Colors.END
|
||||
)
|
||||
)
|
||||
if utils.koboldai_vars.model in (
|
||||
if self.model_name in (
|
||||
"TPUMeshTransformerGPTJ",
|
||||
"TPUMeshTransformerGPTNeoX",
|
||||
) and (
|
||||
@@ -164,7 +168,7 @@ class HFMTJInferenceModel(HFInferenceModel):
|
||||
raise FileNotFoundError(
|
||||
f"The specified model path {repr(utils.koboldai_vars.custmodpth)} is not the path to a valid folder"
|
||||
)
|
||||
if utils.koboldai_vars.model == "TPUMeshTransformerGPTNeoX":
|
||||
if self.model_name == "TPUMeshTransformerGPTNeoX":
|
||||
tpu_mtj_backend.pad_token_id = 2
|
||||
|
||||
tpu_mtj_backend.koboldai_vars = utils.koboldai_vars
|
||||
@@ -175,13 +179,15 @@ class HFMTJInferenceModel(HFInferenceModel):
|
||||
tpu_mtj_backend.settings_callback = mtj_settings_callback
|
||||
|
||||
def _load(self, save_model: bool, initial_load: bool) -> None:
|
||||
import tpu_mtj_backend
|
||||
self.setup_mtj()
|
||||
self.init_model_config()
|
||||
utils.koboldai_vars.allowsp = True
|
||||
|
||||
logger.info(self.model_name)
|
||||
tpu_mtj_backend.load_model(
|
||||
utils.koboldai_vars.model,
|
||||
hf_checkpoint=utils.koboldai_vars.model
|
||||
self.model_name,
|
||||
hf_checkpoint=self.model_name
|
||||
not in ("TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX")
|
||||
and utils.koboldai_vars.use_colab_tpu,
|
||||
socketio_queue=koboldai_settings.queue,
|
||||
@@ -193,12 +199,11 @@ class HFMTJInferenceModel(HFInferenceModel):
|
||||
utils.koboldai_vars.modeldim = int(
|
||||
tpu_mtj_backend.params.get("d_embed", tpu_mtj_backend.params["d_model"])
|
||||
)
|
||||
|
||||
self.tokenizer = tpu_mtj_backend.tokenizer
|
||||
self.tokenizer = GenericTokenizer(tpu_mtj_backend.tokenizer)
|
||||
|
||||
if (
|
||||
utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default
|
||||
and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj")
|
||||
and self.model_type not in ("gpt2", "gpt_neo", "gptj")
|
||||
):
|
||||
utils.koboldai_vars.badwordsids = [
|
||||
[v]
|
||||
@@ -207,6 +212,7 @@ class HFMTJInferenceModel(HFInferenceModel):
|
||||
]
|
||||
|
||||
def get_soft_tokens(self) -> np.array:
|
||||
import tpu_mtj_backend
|
||||
soft_tokens = None
|
||||
|
||||
if utils.koboldai_vars.sp is None:
|
||||
@@ -258,6 +264,7 @@ class HFMTJInferenceModel(HFInferenceModel):
|
||||
seed: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> GenerationResult:
|
||||
import tpu_mtj_backend
|
||||
warpers.update_settings()
|
||||
|
||||
soft_tokens = self.get_soft_tokens()
|
@@ -53,15 +53,12 @@ LOG_SAMPLER_NO_EFFECT = False
|
||||
|
||||
|
||||
class HFTorchInferenceModel(HFInferenceModel):
|
||||
def __init__(
|
||||
self,
|
||||
model_name: str,
|
||||
lazy_load: bool,
|
||||
low_mem: bool,
|
||||
) -> None:
|
||||
super().__init__(model_name)
|
||||
self.lazy_load = lazy_load
|
||||
self.low_mem = low_mem
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.hf_torch = True
|
||||
self.lazy_load = True
|
||||
self.low_mem = False
|
||||
self.nobreakmodel = False
|
||||
|
||||
self.post_token_hooks = [
|
||||
PostTokenHooks.stream_tokens,
|
||||
@@ -128,7 +125,19 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
else:
|
||||
return "Unknown"
|
||||
|
||||
def get_auxilary_device(self):
|
||||
"""Get device auxilary tensors like inputs should be stored on."""
|
||||
|
||||
# NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU.
|
||||
if utils.koboldai_vars.hascuda and self.usegpu:
|
||||
return utils.koboldai_vars.gpu_device
|
||||
elif utils.koboldai_vars.hascuda and self.breakmodel:
|
||||
import breakmodel
|
||||
return breakmodel.primary_device
|
||||
return "cpu"
|
||||
|
||||
def _post_load(m_self) -> None:
|
||||
|
||||
if not utils.koboldai_vars.model_type:
|
||||
utils.koboldai_vars.model_type = m_self.get_model_type()
|
||||
|
||||
@@ -228,7 +237,7 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
else:
|
||||
gen_in = prompt_tokens
|
||||
|
||||
device = utils.get_auxilary_device()
|
||||
device = self.get_auxilary_device()
|
||||
gen_in = gen_in.to(device)
|
||||
|
||||
additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else []
|
||||
@@ -245,7 +254,7 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
len(prompt_tokens) + max_new, utils.koboldai_vars.max_length
|
||||
),
|
||||
repetition_penalty=1.0,
|
||||
bad_words_ids=utils.koboldai_vars.badwordsids
|
||||
bad_words_ids=self.badwordsids
|
||||
+ additional_bad_words_ids,
|
||||
use_cache=True,
|
||||
num_return_sequences=batch_count,
|
||||
@@ -291,11 +300,14 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
logger.error("Invalid load key! Aborting.")
|
||||
raise
|
||||
|
||||
logger.warning(f"Fell back to GPT2LMHeadModel due to {traceback.format_exc()}")
|
||||
logger.warning(f"Fell back to GPT2LMHeadModel due to {e}")
|
||||
logger.debug(traceback.format_exc())
|
||||
|
||||
try:
|
||||
return GPT2LMHeadModel.from_pretrained(location, **tf_kwargs)
|
||||
except Exception as e:
|
||||
logger.warning(f"Fell back to GPTNeoForCausalLM due to {e}")
|
||||
logger.debug(traceback.format_exc())
|
||||
return GPTNeoForCausalLM.from_pretrained(location, **tf_kwargs)
|
||||
|
||||
def get_hidden_size(self) -> int:
|
||||
@@ -401,8 +413,6 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
if not self.lazy_load:
|
||||
return
|
||||
|
||||
if utils.args.breakmodel_disklayers is not None:
|
||||
breakmodel.disk_blocks = utils.args.breakmodel_disklayers
|
||||
|
||||
disk_blocks = breakmodel.disk_blocks
|
||||
gpu_blocks = breakmodel.gpu_blocks
|
||||
@@ -422,31 +432,37 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
device_map: Dict[str, Union[str, int]] = {}
|
||||
|
||||
@functools.lru_cache(maxsize=None)
|
||||
def get_original_key(key):
|
||||
def get_original_key(key) -> Optional[str]:
|
||||
try:
|
||||
return max(
|
||||
(
|
||||
original_key
|
||||
for original_key in utils.module_names
|
||||
if original_key.endswith(key)
|
||||
),
|
||||
key=len,
|
||||
)
|
||||
key_candidates = [
|
||||
original_key
|
||||
for original_key in utils.module_names
|
||||
if original_key.endswith(key)
|
||||
]
|
||||
except ValueError:
|
||||
return key
|
||||
|
||||
if not key_candidates:
|
||||
logger.debug(f"!!! No key candidates for {key}")
|
||||
return None
|
||||
|
||||
return max(key_candidates, key=len)
|
||||
|
||||
for key, value in model_dict.items():
|
||||
original_key = get_original_key(key)
|
||||
|
||||
if not original_key:
|
||||
continue
|
||||
|
||||
if isinstance(value, lazy_loader.LazyTensor) and not any(
|
||||
original_key.startswith(n) for n in utils.layers_module_names
|
||||
):
|
||||
device_map[key] = (
|
||||
utils.koboldai_vars.gpu_device
|
||||
if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu
|
||||
if utils.koboldai_vars.hascuda and self.usegpu
|
||||
else "cpu"
|
||||
if not utils.koboldai_vars.hascuda
|
||||
or not utils.koboldai_vars.breakmodel
|
||||
or not self.breakmodel
|
||||
else breakmodel.primary_device
|
||||
)
|
||||
else:
|
||||
@@ -462,12 +478,12 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
)
|
||||
device = (
|
||||
utils.koboldai_vars.gpu_device
|
||||
if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu
|
||||
if utils.koboldai_vars.hascuda and self.usegpu
|
||||
else "disk"
|
||||
if layer < disk_blocks and layer < ram_blocks
|
||||
else "cpu"
|
||||
if not utils.koboldai_vars.hascuda
|
||||
or not utils.koboldai_vars.breakmodel
|
||||
or not self.breakmodel
|
||||
else "shared"
|
||||
if layer < ram_blocks
|
||||
else bisect.bisect_right(
|
||||
@@ -535,11 +551,9 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
last_storage_key = storage_key
|
||||
if isinstance(f, zipfile.ZipExtFile):
|
||||
f.close()
|
||||
try:
|
||||
f = z.open(f"archive/data/{storage_key}")
|
||||
except:
|
||||
ziproot = z.namelist()[0].split("/")[0]
|
||||
f = z.open(f"{ziproot}/data/{storage_key}")
|
||||
ziproot = z.namelist()[0].split("/")[0]
|
||||
f = z.open(f"{ziproot}/data/{storage_key}")
|
||||
|
||||
current_offset = 0
|
||||
if current_offset != model_dict[key].seek_offset:
|
||||
f.read(model_dict[key].seek_offset - current_offset)
|
||||
@@ -563,6 +577,7 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
)
|
||||
)
|
||||
# print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
|
||||
#logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ")
|
||||
model_dict[key] = model_dict[key].materialize(
|
||||
f, map_location="cpu"
|
||||
)
|
||||
@@ -573,15 +588,15 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
and breakmodel.primary_device != "cpu"
|
||||
and utils.koboldai_vars.hascuda
|
||||
and (
|
||||
utils.koboldai_vars.breakmodel
|
||||
or utils.koboldai_vars.usegpu
|
||||
self.breakmodel
|
||||
or self.usegpu
|
||||
)
|
||||
and model_dict[key].dtype is torch.float32
|
||||
):
|
||||
model_dict[key] = model_dict[key].to(torch.float16)
|
||||
if breakmodel.primary_device == "cpu" or (
|
||||
not utils.koboldai_vars.usegpu
|
||||
and not utils.koboldai_vars.breakmodel
|
||||
not self.usegpu
|
||||
and not self.breakmodel
|
||||
and model_dict[key].dtype is torch.float16
|
||||
):
|
||||
model_dict[key] = model_dict[key].to(torch.float32)
|
||||
@@ -619,14 +634,14 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
and breakmodel.primary_device != "cpu"
|
||||
and utils.koboldai_vars.hascuda
|
||||
and (
|
||||
utils.koboldai_vars.breakmodel
|
||||
or utils.koboldai_vars.usegpu
|
||||
self.breakmodel
|
||||
or self.usegpu
|
||||
)
|
||||
):
|
||||
dtype = torch.float16
|
||||
if breakmodel.primary_device == "cpu" or (
|
||||
not utils.koboldai_vars.usegpu
|
||||
and not utils.koboldai_vars.breakmodel
|
||||
not self.usegpu
|
||||
and not self.breakmodel
|
||||
):
|
||||
dtype = torch.float32
|
||||
if (
|
||||
@@ -682,16 +697,16 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
and breakmodel.primary_device != "cpu"
|
||||
and utils.koboldai_vars.hascuda
|
||||
and (
|
||||
utils.koboldai_vars.breakmodel
|
||||
or utils.koboldai_vars.usegpu
|
||||
self.breakmodel
|
||||
or self.usegpu
|
||||
)
|
||||
and model_dict[key].dtype is torch.float32
|
||||
):
|
||||
model_dict[key] = model_dict[key].to(torch.float16)
|
||||
|
||||
if breakmodel.primary_device == "cpu" or (
|
||||
not utils.koboldai_vars.usegpu
|
||||
and not utils.koboldai_vars.breakmodel
|
||||
not self.usegpu
|
||||
and not self.breakmodel
|
||||
and model_dict[key].dtype is torch.float16
|
||||
):
|
||||
model_dict[key] = model_dict[key].to(torch.float32)
|
||||
@@ -730,14 +745,14 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
and breakmodel.primary_device != "cpu"
|
||||
and utils.koboldai_vars.hascuda
|
||||
and (
|
||||
utils.koboldai_vars.breakmodel
|
||||
or utils.koboldai_vars.usegpu
|
||||
self.breakmodel
|
||||
or self.usegpu
|
||||
)
|
||||
):
|
||||
dtype = torch.float16
|
||||
if breakmodel.primary_device == "cpu" or (
|
||||
not utils.koboldai_vars.usegpu
|
||||
and not utils.koboldai_vars.breakmodel
|
||||
not self.usegpu
|
||||
and not self.breakmodel
|
||||
):
|
||||
dtype = torch.float32
|
||||
if (
|
||||
@@ -771,7 +786,7 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
if always_use or (
|
||||
utils.koboldai_vars.hascuda
|
||||
and self.low_mem
|
||||
and (utils.koboldai_vars.usegpu or utils.koboldai_vars.breakmodel)
|
||||
and (self.usegpu or self.breakmodel)
|
||||
):
|
||||
original_dtype = torch.get_default_dtype()
|
||||
torch.set_default_dtype(torch.float16)
|
||||
@@ -786,6 +801,8 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
device_count = torch.cuda.device_count()
|
||||
if device_count < 2:
|
||||
primary = None
|
||||
logger.debug("n_layers: {}".format(n_layers))
|
||||
logger.debug("gpu blocks: {}".format(breakmodel.gpu_blocks))
|
||||
gpu_blocks = breakmodel.gpu_blocks + (
|
||||
device_count - len(breakmodel.gpu_blocks)
|
||||
) * [0]
|
||||
@@ -816,155 +833,47 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
|
||||
n_layers = utils.num_layers(config)
|
||||
|
||||
logger.debug("gpu blocks before modification: {}".format(breakmodel.gpu_blocks))
|
||||
|
||||
if utils.args.cpu:
|
||||
breakmodel.gpu_blocks = [0] * n_layers
|
||||
return
|
||||
|
||||
elif (
|
||||
utils.args.breakmodel_gpulayers is not None
|
||||
or utils.args.breakmodel_disklayers is not None
|
||||
):
|
||||
try:
|
||||
if not utils.args.breakmodel_gpulayers:
|
||||
breakmodel.gpu_blocks = []
|
||||
else:
|
||||
breakmodel.gpu_blocks = list(
|
||||
map(int, utils.args.breakmodel_gpulayers.split(","))
|
||||
)
|
||||
assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count()
|
||||
s = n_layers
|
||||
for i in range(len(breakmodel.gpu_blocks)):
|
||||
if breakmodel.gpu_blocks[i] <= -1:
|
||||
breakmodel.gpu_blocks[i] = s
|
||||
break
|
||||
else:
|
||||
s -= breakmodel.gpu_blocks[i]
|
||||
assert sum(breakmodel.gpu_blocks) <= n_layers
|
||||
n_layers -= sum(breakmodel.gpu_blocks)
|
||||
if utils.args.breakmodel_disklayers is not None:
|
||||
assert utils.args.breakmodel_disklayers <= n_layers
|
||||
breakmodel.disk_blocks = utils.args.breakmodel_disklayers
|
||||
n_layers -= utils.args.breakmodel_disklayers
|
||||
except:
|
||||
logger.warning(
|
||||
"--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0."
|
||||
)
|
||||
breakmodel.gpu_blocks = [n_layers]
|
||||
n_layers = 0
|
||||
elif utils.args.breakmodel_layers is not None:
|
||||
breakmodel.gpu_blocks = [
|
||||
n_layers - max(0, min(n_layers, utils.args.breakmodel_layers))
|
||||
]
|
||||
n_layers -= sum(breakmodel.gpu_blocks)
|
||||
elif utils.args.model is not None:
|
||||
elif breakmodel.gpu_blocks == []:
|
||||
logger.info("Breakmodel not specified, assuming GPU 0")
|
||||
breakmodel.gpu_blocks = [n_layers]
|
||||
n_layers = 0
|
||||
|
||||
else:
|
||||
device_count = torch.cuda.device_count()
|
||||
if device_count > 1:
|
||||
print(
|
||||
Colors.CYAN
|
||||
+ "\nPlease select one of your GPUs to be your primary GPU."
|
||||
)
|
||||
print(
|
||||
"VRAM usage in your primary GPU will be higher than for your other ones."
|
||||
)
|
||||
print("It is recommended you make your fastest GPU your primary GPU.")
|
||||
self.breakmodel_device_list(n_layers)
|
||||
while True:
|
||||
primaryselect = input("device ID> ")
|
||||
if (
|
||||
primaryselect.isnumeric()
|
||||
and 0 <= int(primaryselect) < device_count
|
||||
):
|
||||
breakmodel.primary_device = int(primaryselect)
|
||||
break
|
||||
else:
|
||||
print(
|
||||
f"{Colors.RED}Please enter an integer between 0 and {device_count-1}.{Colors.END}"
|
||||
)
|
||||
else:
|
||||
breakmodel.primary_device = 0
|
||||
|
||||
print(
|
||||
Colors.PURPLE
|
||||
+ "\nIf you don't have enough VRAM to run the model on a single GPU"
|
||||
)
|
||||
print(
|
||||
"you can split the model between your CPU and your GPU(s), or between"
|
||||
)
|
||||
print("multiple GPUs if you have more than one.")
|
||||
print("By putting more 'layers' on a GPU or CPU, more computations will be")
|
||||
print(
|
||||
"done on that device and more VRAM or RAM will be required on that device"
|
||||
)
|
||||
print("(roughly proportional to number of layers).")
|
||||
print(
|
||||
"It should be noted that GPUs are orders of magnitude faster than the CPU."
|
||||
)
|
||||
print(
|
||||
f"This model has{Colors.YELLOW} {n_layers} {Colors.PURPLE}layers.{Colors.END}\n"
|
||||
)
|
||||
|
||||
for i in range(device_count):
|
||||
self.breakmodel_device_list(
|
||||
n_layers, primary=breakmodel.primary_device, selected=i
|
||||
)
|
||||
print(
|
||||
f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
|
||||
)
|
||||
while True:
|
||||
layerselect = input("# of layers> ")
|
||||
if (
|
||||
layerselect.isnumeric() or layerselect.strip() == "-1"
|
||||
) and -1 <= int(layerselect) <= n_layers:
|
||||
layerselect = int(layerselect)
|
||||
layerselect = n_layers if layerselect == -1 else layerselect
|
||||
breakmodel.gpu_blocks.append(layerselect)
|
||||
n_layers -= layerselect
|
||||
break
|
||||
else:
|
||||
print(
|
||||
f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
|
||||
)
|
||||
if n_layers == 0:
|
||||
s = n_layers
|
||||
for i in range(len(breakmodel.gpu_blocks)):
|
||||
if breakmodel.gpu_blocks[i] <= -1:
|
||||
breakmodel.gpu_blocks[i] = s
|
||||
break
|
||||
|
||||
if n_layers > 0:
|
||||
self.breakmodel_device_list(
|
||||
n_layers, primary=breakmodel.primary_device, selected=-1
|
||||
)
|
||||
print(
|
||||
f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
|
||||
)
|
||||
while True:
|
||||
layerselect = input("# of layers> ")
|
||||
if (
|
||||
layerselect.isnumeric() or layerselect.strip() == "-1"
|
||||
) and -1 <= int(layerselect) <= n_layers:
|
||||
layerselect = int(layerselect)
|
||||
layerselect = n_layers if layerselect == -1 else layerselect
|
||||
breakmodel.disk_blocks = layerselect
|
||||
n_layers -= layerselect
|
||||
break
|
||||
else:
|
||||
print(
|
||||
f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
|
||||
)
|
||||
else:
|
||||
s -= breakmodel.gpu_blocks[i]
|
||||
assert sum(breakmodel.gpu_blocks) <= n_layers
|
||||
n_layers -= sum(breakmodel.gpu_blocks)
|
||||
if breakmodel.disk_blocks is not None:
|
||||
assert breakmodel.disk_blocks <= n_layers
|
||||
n_layers -= breakmodel.disk_blocks
|
||||
|
||||
logger.init_ok("Final device configuration:", status="Info")
|
||||
self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)
|
||||
with open("settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w") as file:
|
||||
file.write("{}\n{}".format(",".join(map(str, breakmodel.gpu_blocks)), breakmodel.disk_blocks))
|
||||
|
||||
# If all layers are on the same device, use the old GPU generation mode
|
||||
while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0:
|
||||
breakmodel.gpu_blocks.pop()
|
||||
self.breakmodel = True
|
||||
if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (
|
||||
-1,
|
||||
utils.num_layers(config),
|
||||
):
|
||||
utils.koboldai_vars.breakmodel = False
|
||||
utils.koboldai_vars.usegpu = True
|
||||
logger.debug("All layers on same GPU. Breakmodel disabled")
|
||||
self.breakmodel = False
|
||||
self.usegpu = True
|
||||
utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks) - 1
|
||||
return
|
||||
|
||||
@@ -973,6 +882,6 @@ class HFTorchInferenceModel(HFInferenceModel):
|
||||
import breakmodel
|
||||
|
||||
breakmodel.primary_device = "cpu"
|
||||
utils.koboldai_vars.breakmodel = False
|
||||
utils.koboldai_vars.usegpu = False
|
||||
self.breakmodel = False
|
||||
self.usegpu = False
|
||||
return
|
||||
|
@@ -1,10 +1,11 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
import time, json
|
||||
import torch
|
||||
import requests
|
||||
import numpy as np
|
||||
from typing import List, Optional, Union
|
||||
import os
|
||||
|
||||
import utils
|
||||
from logger import logger
|
||||
@@ -16,25 +17,131 @@ from modeling.inference_model import (
|
||||
ModelCapabilities,
|
||||
)
|
||||
|
||||
model_backend_name = "Horde"
|
||||
|
||||
class HordeException(Exception):
|
||||
"""To be used for errors on server side of the Horde."""
|
||||
|
||||
|
||||
class HordeInferenceModel(InferenceModel):
|
||||
class model_backend(InferenceModel):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.url = "https://horde.koboldai.net"
|
||||
self.key = "0000000000"
|
||||
self.models = self.get_cluster_models()
|
||||
self.model_name = "Horde"
|
||||
self.model = []
|
||||
|
||||
|
||||
# Do not allow API to be served over the API
|
||||
self.capabilties = ModelCapabilities(api_host=False)
|
||||
|
||||
def is_valid(self, model_name, model_path, menu_path):
|
||||
logger.debug("Horde Models: {}".format(self.models))
|
||||
return model_name == "CLUSTER" or model_name in [x['value'] for x in self.models]
|
||||
|
||||
def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
|
||||
if os.path.exists("settings/api.model_backend.settings") and 'base_url' not in vars(self):
|
||||
with open("settings/horde.model_backend.settings", "r") as f:
|
||||
temp = json.load(f)
|
||||
self.base_url = temp['url']
|
||||
self.key = temp['key']
|
||||
if 'key' in parameters:
|
||||
self.key = parameters['key']
|
||||
if 'url' in parameters:
|
||||
self.url = parameters['url']
|
||||
requested_parameters = []
|
||||
requested_parameters.extend([{
|
||||
"uitype": "text",
|
||||
"unit": "text",
|
||||
"label": "URL",
|
||||
"id": "url",
|
||||
"default": self.url if 'url' not in parameters else parameters['url'],
|
||||
"tooltip": "URL to the horde.",
|
||||
"menu_path": "",
|
||||
"check": {"value": "", 'check': "!="},
|
||||
"refresh_model_inputs": True,
|
||||
"extra_classes": ""
|
||||
},
|
||||
{
|
||||
"uitype": "text",
|
||||
"unit": "text",
|
||||
"label": "Key",
|
||||
"id": "key",
|
||||
"default": self.key if 'key' not in parameters else parameters['key'],
|
||||
"check": {"value": "", 'check': "!="},
|
||||
"tooltip": "User Key to use when connecting to Horde (0000000000 is anonymous).",
|
||||
"menu_path": "",
|
||||
"refresh_model_inputs": True,
|
||||
"extra_classes": ""
|
||||
},
|
||||
{
|
||||
"uitype": "dropdown",
|
||||
"unit": "text",
|
||||
"label": "Model",
|
||||
"id": "model",
|
||||
"default": model_name,
|
||||
"check": {"value": "", 'check': "!="},
|
||||
'multiple': True,
|
||||
"tooltip": "Which model to use when running OpenAI/GooseAI.",
|
||||
"menu_path": "",
|
||||
"refresh_model_inputs": False,
|
||||
"extra_classes": "",
|
||||
'children': self.models,
|
||||
|
||||
}])
|
||||
return requested_parameters
|
||||
|
||||
def set_input_parameters(self, parameters):
|
||||
self.key = parameters['key'].strip()
|
||||
self.model = parameters['model']
|
||||
self.url = parameters['url']
|
||||
|
||||
def get_cluster_models(self):
|
||||
# Get list of models from public cluster
|
||||
try:
|
||||
req = requests.get(f"{self.url}/api/v2/status/models?type=text")
|
||||
except:
|
||||
logger.init_err("KAI Horde Models", status="Failed")
|
||||
logger.error("Provided KoboldAI Horde URL unreachable")
|
||||
emit('from_server', {'cmd': 'errmsg', 'data': "Provided KoboldAI Horde URL unreachable"})
|
||||
return
|
||||
if not req.ok:
|
||||
# Something went wrong, print the message and quit since we can't initialize an engine
|
||||
logger.init_err("KAI Horde Models", status="Failed")
|
||||
logger.error(req.json())
|
||||
emit('from_server', {'cmd': 'errmsg', 'data': req.json()}, room="UI_1")
|
||||
return
|
||||
|
||||
engines = req.json()
|
||||
try:
|
||||
engines = [{"text": "All", "value": "all"}] + [{"text": en["name"], "value": en["name"]} for en in engines]
|
||||
except:
|
||||
logger.error(engines)
|
||||
raise
|
||||
logger.debug(engines)
|
||||
|
||||
online_model = ""
|
||||
|
||||
logger.init_ok("KAI Horde Models", status="OK")
|
||||
|
||||
return engines
|
||||
|
||||
def _load(self, save_model: bool, initial_load: bool) -> None:
|
||||
tokenizer_name = "gpt2"
|
||||
if len(self.model) > 0:
|
||||
if self.model[0] == "all" and len(self.model) > 1:
|
||||
tokenizer_name = self.model[1]
|
||||
else:
|
||||
tokenizer_name = self.model[0]
|
||||
self.tokenizer = self._get_tokenizer(
|
||||
utils.koboldai_vars.cluster_requested_models[0]
|
||||
if len(utils.koboldai_vars.cluster_requested_models) > 0
|
||||
else "gpt2",
|
||||
tokenizer_name
|
||||
)
|
||||
|
||||
def _save_settings(self):
|
||||
with open("settings/horde.model_backend.settings", "w") as f:
|
||||
json.dump({"key": self.key, "url": self.url}, f, indent="")
|
||||
|
||||
def _raw_generate(
|
||||
self,
|
||||
prompt_tokens: Union[List[int], torch.Tensor],
|
||||
@@ -80,14 +187,14 @@ class HordeInferenceModel(InferenceModel):
|
||||
|
||||
client_agent = "KoboldAI:2.0.0:koboldai.org"
|
||||
cluster_headers = {
|
||||
"apikey": utils.koboldai_vars.horde_api_key,
|
||||
"apikey": self.key,
|
||||
"Client-Agent": client_agent,
|
||||
}
|
||||
|
||||
try:
|
||||
# Create request
|
||||
req = requests.post(
|
||||
f"{utils.koboldai_vars.horde_url}/api/v2/generate/text/async",
|
||||
f"{self.url}/api/v2/generate/text/async",
|
||||
json=cluster_metadata,
|
||||
headers=cluster_headers,
|
||||
)
|
||||
@@ -125,7 +232,7 @@ class HordeInferenceModel(InferenceModel):
|
||||
while not finished:
|
||||
try:
|
||||
req = requests.get(
|
||||
f"{utils.koboldai_vars.horde_url}/api/v2/generate/text/status/{request_id}",
|
||||
f"{self.url}/api/v2/generate/text/status/{request_id}",
|
||||
headers=cluster_agent_headers,
|
||||
)
|
||||
except requests.exceptions.ConnectionError:
|
@@ -1,106 +0,0 @@
|
||||
import torch
|
||||
import requests
|
||||
import numpy as np
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import utils
|
||||
from logger import logger
|
||||
from modeling.inference_model import (
|
||||
GenerationResult,
|
||||
GenerationSettings,
|
||||
InferenceModel,
|
||||
)
|
||||
|
||||
|
||||
class OpenAIAPIError(Exception):
|
||||
def __init__(self, error_type: str, error_message) -> None:
|
||||
super().__init__(f"{error_type}: {error_message}")
|
||||
|
||||
|
||||
class OpenAIAPIInferenceModel(InferenceModel):
|
||||
"""InferenceModel for interfacing with OpenAI's generation API."""
|
||||
|
||||
def _load(self, save_model: bool, initial_load: bool) -> None:
|
||||
self.tokenizer = self._get_tokenizer("gpt2")
|
||||
|
||||
def _raw_generate(
|
||||
self,
|
||||
prompt_tokens: Union[List[int], torch.Tensor],
|
||||
max_new: int,
|
||||
gen_settings: GenerationSettings,
|
||||
single_line: bool = False,
|
||||
batch_count: int = 1,
|
||||
seed: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> GenerationResult:
|
||||
|
||||
if seed is not None:
|
||||
logger.warning(
|
||||
"Seed is unsupported on the OpenAIAPIInferenceModel. Seed will be ignored."
|
||||
)
|
||||
|
||||
decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens))
|
||||
|
||||
# Store context in memory to use it for comparison with generated content
|
||||
utils.koboldai_vars.lastctx = decoded_prompt
|
||||
|
||||
# Build request JSON data
|
||||
# GooseAI is a subntype of OAI. So to check if it's this type, we check the configname as a workaround
|
||||
# as the koboldai_vars.model will always be OAI
|
||||
if "GooseAI" in utils.koboldai_vars.configname:
|
||||
reqdata = {
|
||||
"prompt": decoded_prompt,
|
||||
"max_tokens": max_new,
|
||||
"temperature": gen_settings.temp,
|
||||
"top_a": gen_settings.top_a,
|
||||
"top_p": gen_settings.top_p,
|
||||
"top_k": gen_settings.top_k,
|
||||
"tfs": gen_settings.tfs,
|
||||
"typical_p": gen_settings.typical,
|
||||
"repetition_penalty": gen_settings.rep_pen,
|
||||
"repetition_penalty_slope": gen_settings.rep_pen_slope,
|
||||
"repetition_penalty_range": gen_settings.rep_pen_range,
|
||||
"n": batch_count,
|
||||
# TODO: Implement streaming
|
||||
"stream": False,
|
||||
}
|
||||
else:
|
||||
reqdata = {
|
||||
"prompt": decoded_prompt,
|
||||
"max_tokens": max_new,
|
||||
"temperature": gen_settings.temp,
|
||||
"top_p": gen_settings.top_p,
|
||||
"frequency_penalty": gen_settings.rep_pen,
|
||||
"n": batch_count,
|
||||
"stream": False,
|
||||
}
|
||||
|
||||
req = requests.post(
|
||||
utils.koboldai_vars.oaiurl,
|
||||
json=reqdata,
|
||||
headers={
|
||||
"Authorization": "Bearer " + utils.koboldai_vars.oaiapikey,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
)
|
||||
|
||||
j = req.json()
|
||||
|
||||
if not req.ok:
|
||||
# Send error message to web client
|
||||
if "error" in j:
|
||||
error_type = j["error"]["type"]
|
||||
error_message = j["error"]["message"]
|
||||
else:
|
||||
error_type = "Unknown"
|
||||
error_message = "Unknown"
|
||||
raise OpenAIAPIError(error_type, error_message)
|
||||
|
||||
outputs = [out["text"] for out in j["choices"]]
|
||||
return GenerationResult(
|
||||
model=self,
|
||||
out_batches=np.array([self.tokenizer.encode(x) for x in outputs]),
|
||||
prompt=prompt_tokens,
|
||||
is_whole_generation=True,
|
||||
single_line=single_line,
|
||||
)
|
modeling/inference_models/openai/class.py (33 lines, normal file)
@@ -0,0 +1,33 @@
import torch
import requests
import numpy as np
from typing import List, Optional, Union
import os

import utils
from logger import logger
from modeling.inference_model import (
    GenerationResult,
    GenerationSettings,
    InferenceModel,
)

from modeling.inference_models.openai_gooseai import model_backend as openai_gooseai_model_backend

model_backend_name = "OpenAI"

class OpenAIAPIError(Exception):
    def __init__(self, error_type: str, error_message) -> None:
        super().__init__(f"{error_type}: {error_message}")


class model_backend(openai_gooseai_model_backend):
    """InferenceModel for interfacing with OpenAI's generation API."""

    def __init__(self):
        super().__init__()
        self.url = "https://api.openai.com/v1/engines"
        self.source = "OpenAI"

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "OAI"
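The OpenAI backend above reuses everything from the shared openai_gooseai backend and only overrides the engines URL, the source label, and the menu check. As a hedged illustration of that pattern, a third OpenAI-compatible endpoint could be added the same way; the names and URL below are hypothetical, not part of this commit.

# Hypothetical sketch: another OpenAI-compatible backend built on the shared class.
from modeling.inference_models.openai_gooseai import model_backend as openai_gooseai_model_backend

model_backend_name = "My OpenAI-compatible API"  # assumed display name

class model_backend(openai_gooseai_model_backend):
    def __init__(self):
        super().__init__()
        self.url = "https://example.invalid/v1/engines"  # assumed engines endpoint
        self.source = "MyAPI"  # used for the settings filename and payload selection

    def is_valid(self, model_name, model_path, menu_path):
        # Only claim menu entries meant for this backend
        return model_name == "MyAPI"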
modeling/inference_models/openai_gooseai.py (Normal file, 199 lines)
@@ -0,0 +1,199 @@
import torch
import requests, json
import numpy as np
from typing import List, Optional, Union
import os

import utils
from logger import logger
from modeling.inference_model import (
    GenerationResult,
    GenerationSettings,
    InferenceModel,
)


class OpenAIAPIError(Exception):
    def __init__(self, error_type: str, error_message) -> None:
        super().__init__(f"{error_type}: {error_message}")


class model_backend(InferenceModel):
    """InferenceModel for interfacing with OpenAI's generation API."""

    def __init__(self):
        super().__init__()
        self.key = ""
        self.url = "https://api.goose.ai/v1/engines"

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "OAI" or model_name == "GooseAI"

    def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
        if os.path.exists("settings/{}.model_backend.settings".format(self.source)) and 'colaburl' not in vars(self):
            with open("settings/{}.model_backend.settings".format(self.source), "r") as f:
                try:
                    self.key = json.load(f)['key']
                except:
                    pass
        if 'key' in parameters:
            self.key = parameters['key']
        self.source = model_name
        requested_parameters = []
        requested_parameters.extend([{
            "uitype": "text",
            "unit": "text",
            "label": "Key",
            "id": "key",
            "default": self.key,
            "check": {"value": "", 'check': "!="},
            "tooltip": "User Key to use when connecting to OpenAI/GooseAI.",
            "menu_path": "",
            "refresh_model_inputs": True,
            "extra_classes": ""
        },
        {
            "uitype": "dropdown",
            "unit": "text",
            "label": "Model",
            "id": "model",
            "default": "",
            "check": {"value": "", 'check': "!="},
            "tooltip": "Which model to use when running OpenAI/GooseAI.",
            "menu_path": "",
            "refresh_model_inputs": False,
            "extra_classes": "",
            'children': self.get_oai_models(),

        }])
        return requested_parameters

    def set_input_parameters(self, parameters):
        self.key = parameters['key'].strip()
        self.model_name = parameters['model']

    def get_oai_models(self):
        if self.key == "":
            return []

        # Get list of models from OAI
        logger.init("OAI Engines", status="Retrieving")
        req = requests.get(
            self.url,
            headers = {
                'Authorization': 'Bearer '+self.key
            }
        )
        if(req.status_code == 200):
            r = req.json()
            engines = r["data"]
            try:
                engines = [{"value": en["id"], "text": "{} ({})".format(en['id'], "Ready" if en["ready"] == True else "Not Ready")} for en in engines]
            except:
                logger.error(engines)
                raise

            online_model = ""

            logger.init_ok("OAI Engines", status="OK")
            logger.debug("OAI Engines: {}".format(engines))
            return engines
        else:
            # Something went wrong, print the message and quit since we can't initialize an engine
            logger.init_err("OAI Engines", status="Failed")
            logger.error(req.json())
            emit('from_server', {'cmd': 'errmsg', 'data': req.json()})
            return []

    def _load(self, save_model: bool, initial_load: bool) -> None:
        self.tokenizer = self._get_tokenizer("gpt2")

    def _save_settings(self):
        with open("settings/{}.model_backend.settings".format(self.source), "w") as f:
            json.dump({"key": self.key}, f, indent="")

    def _raw_generate(
        self,
        prompt_tokens: Union[List[int], torch.Tensor],
        max_new: int,
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        seed: Optional[int] = None,
        **kwargs,
    ) -> GenerationResult:

        if seed is not None:
            logger.warning(
                "Seed is unsupported on the OpenAIAPIInferenceModel. Seed will be ignored."
            )

        decoded_prompt = utils.decodenewlines(self.tokenizer.decode(prompt_tokens))

        # Store context in memory to use it for comparison with generated content
        utils.koboldai_vars.lastctx = decoded_prompt

        # Build request JSON data
        # GooseAI is a subtype of OAI. So to check if it's this type, we check the configname as a workaround
        # as the koboldai_vars.model will always be OAI
        if self.source == "GooseAI":
            reqdata = {
                "prompt": decoded_prompt,
                "max_tokens": max_new,
                "temperature": gen_settings.temp,
                "top_a": gen_settings.top_a,
                "top_p": gen_settings.top_p,
                "top_k": gen_settings.top_k,
                "tfs": gen_settings.tfs,
                "typical_p": gen_settings.typical,
                "repetition_penalty": gen_settings.rep_pen,
                "repetition_penalty_slope": gen_settings.rep_pen_slope,
                "repetition_penalty_range": gen_settings.rep_pen_range,
                "n": batch_count,
                # TODO: Implement streaming
                "stream": False,
            }
        else:
            reqdata = {
                "prompt": decoded_prompt,
                "max_tokens": max_new,
                "temperature": gen_settings.temp,
                "top_p": gen_settings.top_p,
                "frequency_penalty": gen_settings.rep_pen,
                "n": batch_count,
                "stream": False,
            }

        req = requests.post(
            "{}/{}/completions".format(self.url, self.model_name),
            json=reqdata,
            headers={
                "Authorization": "Bearer " + self.key,
                "Content-Type": "application/json",
            },
        )

        j = req.json()

        if not req.ok:
            # Send error message to web client
            if "error" in j:
                error_type = j["error"]["type"]
                error_message = j["error"]["message"]
            else:
                error_type = "Unknown"
                error_message = "Unknown"
            raise OpenAIAPIError(error_type, error_message)

        outputs = [out["text"] for out in j["choices"]]
        return GenerationResult(
            model=self,
            out_batches=np.array([self.tokenizer.encode(x) for x in outputs]),
            prompt=prompt_tokens,
            is_whole_generation=True,
            single_line=single_line,
        )
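In this shared backend, _raw_generate ultimately issues a single POST to "<engines url>/<model>/completions", with the fuller GooseAI payload or the trimmed OpenAI payload depending on self.source. A minimal standalone sketch of that request follows; the key, model id, prompt, and sampler values are placeholders, and only the endpoint shape is taken from the code above.

# Minimal sketch of the completions call built by _raw_generate (placeholder values).
import requests

url = "https://api.goose.ai/v1/engines"   # self.url when the source is GooseAI
model_name = "gpt-neo-20b"                # assumed engine id as returned by get_oai_models()
key = "YOUR_API_KEY"                      # placeholder user key

reqdata = {
    "prompt": "It was a dark and stormy night",
    "max_tokens": 80,
    "temperature": 0.7,
    "top_p": 0.9,
    "repetition_penalty": 1.1,
    "n": 1,
    "stream": False,
}

req = requests.post(
    "{}/{}/completions".format(url, model_name),
    json=reqdata,
    headers={
        "Authorization": "Bearer " + key,
        "Content-Type": "application/json",
    },
)
outputs = [out["text"] for out in req.json()["choices"]]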
modeling/inference_models/readonly/class.py (Normal file, 78 lines)
@@ -0,0 +1,78 @@
from __future__ import annotations

import torch
import requests
import numpy as np
from typing import List, Optional, Union

import utils
from logger import logger
from modeling.inference_model import (
    GenerationResult,
    GenerationSettings,
    InferenceModel,
    ModelCapabilities,
)

model_backend_name = "Read Only"

class BasicAPIException(Exception):
    """To be used for errors when using the Basic API as an interface."""


class model_backend(InferenceModel):
    def __init__(self) -> None:
        super().__init__()

        # Do not allow API to be served over the API
        self.capabilties = ModelCapabilities(api_host=False)
        self.tokenizer = self._tokenizer()
        self.model = None
        self.model_name = "Read Only"

    def is_valid(self, model_name, model_path, menu_path):
        return model_name == "ReadOnly"

    def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
        requested_parameters = []
        return requested_parameters

    def set_input_parameters(self, parameters):
        return

    def unload(self):
        utils.koboldai_vars.noai = False

    def _initialize_model(self):
        return

    class _tokenizer():
        def __init__(self):
            self._koboldai_header = []
        def decode(self, _input):
            return ""
        def encode(self, input_text):
            return []

    def _load(self, save_model: bool = False, initial_load: bool = False) -> None:
        self.tokenizer = self.tokenizer
        self.model = None
        utils.koboldai_vars.noai = True

    def _raw_generate(
        self,
        prompt_tokens: Union[List[int], torch.Tensor],
        max_new: int,
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        seed: Optional[int] = None,
        **kwargs,
    ):
        return GenerationResult(
            model=self,
            out_batches=np.array([]),
            prompt=prompt_tokens,
            is_whole_generation=True,
            single_line=single_line,
        )
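The Read Only backend loads no model at all: a stub tokenizer returns empty output and _raw_generate hands back an empty batch, so the UI stays usable with no AI attached. A self-contained sketch of that stub-tokenizer pattern, reduced to the essentials, is shown below.

# Sketch of the no-op tokenizer pattern used by the Read Only backend:
# every call succeeds but produces nothing, so downstream code can run unchanged.
class _tokenizer:
    def __init__(self):
        self._koboldai_header = []

    def decode(self, _input):
        # Nothing to decode when no model is loaded
        return ""

    def encode(self, input_text):
        # No tokens are ever produced
        return []

tok = _tokenizer()
assert tok.encode("any prompt") == []
assert tok.decode([1, 2, 3]) == ""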
@@ -1,237 +0,0 @@
from __future__ import annotations
import os


import time
from typing import Dict, List, Optional, Union
import numpy as np
import requests
from tokenizers import Tokenizer
from tqdm import tqdm
from huggingface_hub import hf_hub_url

import torch
from torch.nn import functional as F

# Must be defined before import
os.environ["RWKV_JIT_ON"] = "1"
# TODO: Include compiled kernel
os.environ["RWKV_CUDA_ON"] = "1"
from rwkv.model import RWKV

import utils
from logger import logger

from modeling import warpers
from modeling.warpers import Warper
from modeling.stoppers import Stoppers
from modeling.post_token_hooks import PostTokenHooks
from modeling.tokenizer import GenericTokenizer
from modeling.inference_model import (
    GenerationResult,
    GenerationSettings,
    InferenceModel,
    ModelCapabilities,
)

TOKENIZER_URL = (
    "https://raw.githubusercontent.com/BlinkDL/ChatRWKV/main/20B_tokenizer.json"
)
TOKENIZER_PATH = "models/rwkv/20b_tokenizer.json"

REPO_OWNER = "BlinkDL"
MODEL_FILES = {
    "rwkv-4-pile-14b": "RWKV-4-Pile-14B-20230213-8019.pth",
    # NOTE: Still in progress(?)
    "rwkv-4-pile-14b:ctx4096": "RWKV-4-Pile-14B-20230228-ctx4096-test663.pth",
    "rwkv-4-pile-7b": "RWKV-4-Pile-7B-20221115-8047.pth",
    "rwkv-4-pile-7b:ctx4096": "RWKV-4-Pile-7B-20230109-ctx4096.pth",
    "rwkv-4-pile-3b": "RWKV-4-Pile-3B-20221008-8023.pth",
    "rwkv-4-pile-3b:ctx4096": "RWKV-4-Pile-3B-20221110-ctx4096.pth",
    "rwkv-4-pile-1b5": "RWKV-4-Pile-1B5-20220903-8040.pth",
    "rwkv-4-pile-1b5:ctx4096": "RWKV-4-Pile-1B5-20220929-ctx4096.pth",
    "rwkv-4-pile-430m": "RWKV-4-Pile-430M-20220808-8066.pth",
    "rwkv-4-pile-169m": "RWKV-4-Pile-169M-20220807-8023.pth",
}


class RWKVInferenceModel(InferenceModel):
    def __init__(
        self,
        model_name: str,
    ) -> None:
        super().__init__()
        self.model_name = model_name

        self.post_token_hooks = [
            PostTokenHooks.stream_tokens,
        ]

        self.stopper_hooks = [
            Stoppers.core_stopper,
            Stoppers.dynamic_wi_scanner,
            Stoppers.singleline_stopper,
            Stoppers.chat_mode_stopper,
            Stoppers.stop_sequence_stopper,
        ]

        self.capabilties = ModelCapabilities(
            embedding_manipulation=False,
            post_token_hooks=True,
            stopper_hooks=True,
            post_token_probs=True,
        )
        self._old_stopping_criteria = None

    def _ensure_directory_structure(self) -> None:
        for path in ["models/rwkv", "models/rwkv/models"]:
            try:
                os.mkdir(path)
            except FileExistsError:
                pass

    def _get_tokenizer(self) -> GenericTokenizer:
        if not os.path.exists(TOKENIZER_PATH):
            logger.info("RWKV tokenizer not found, downloading...")

            r = requests.get(TOKENIZER_URL)
            with open(TOKENIZER_PATH, "wb") as file:
                file.write(r.content)

        return GenericTokenizer(Tokenizer.from_file(TOKENIZER_PATH))

    def _download_model(self, model_path: str, model_class: str) -> None:
        logger.info(f"{self.model_name} not found, downloading...")

        url = hf_hub_url(
            repo_id=f"{REPO_OWNER}/{model_class}",
            filename=MODEL_FILES[self.model_name],
        )

        # TODO: Use aria2
        # https://stackoverflow.com/a/57030446
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            bar = tqdm(
                desc="Downloading RWKV Model",
                unit="B",
                unit_scale=True,
                total=int(r.headers["Content-Length"]),
            )
            with open(model_path, "wb") as file:
                for chunk in r.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    file.write(chunk)
                    bar.update(len(chunk))

    def _load(self, save_model: bool, initial_load: bool) -> None:
        self._ensure_directory_structure()
        self.tokenizer = self._get_tokenizer()

        # Parse model name
        model_class, _, special = self.model_name.partition(":")
        special = special or None

        model_dir = os.path.join("models", "rwkv", "models", model_class)
        if not os.path.exists(model_dir):
            os.mkdir(model_dir)

        # Download model if we need to
        model_path = os.path.join(model_dir, MODEL_FILES[self.model_name])
        if not os.path.exists(model_path):
            self._download_model(model_path, model_class)

        # Now we load!

        # TODO: Breakmodel to strat
        self.model = RWKV(model=model_path, strategy="cuda:0 fp16")

    def _apply_warpers(
        self, scores: torch.Tensor, input_ids: torch.Tensor
    ) -> torch.Tensor:
        warpers.update_settings()
        for sid in utils.koboldai_vars.sampler_order:
            warper = Warper.from_id(sid)

            if not warper.value_is_valid():
                continue

            if warper == warpers.RepetitionPenalty:
                # Rep pen needs more data than other samplers
                scores = warper.torch(scores, input_ids=input_ids)
            else:
                scores = warper.torch(scores)
        return scores

    def _sample_token(self, logits: torch.Tensor, input_ids: torch.Tensor) -> int:
        probs = F.softmax(logits.float(), dim=-1)

        if probs.device == torch.device("cpu"):
            probs = probs.numpy()
            sorted_ids = np.argsort(probs)
            sorted_probs = probs[sorted_ids][::-1]

            probs = self._apply_warpers(probs[None, :], input_ids)

            # TODO: is this right?
            probs[probs == -torch.inf] = 0.0

            probs = probs / np.sum(probs)
            out = np.random.choice(a=len(probs), p=probs)
            return int(out)
        else:
            sorted_ids = torch.argsort(probs)
            sorted_probs = probs[sorted_ids]
            sorted_probs = torch.flip(sorted_probs, dims=(0,))

            probs = self._apply_warpers(probs[None, :], input_ids)

            # TODO: is this right?
            probs[probs == -torch.inf] = 0.0

            out = torch.multinomial(probs, num_samples=1)[0]
            return int(out)

    def _raw_generate(
        self,
        prompt_tokens: Union[List[int], torch.Tensor],
        max_new: int,
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        seed: Optional[int] = None,
        **kwargs,
    ) -> GenerationResult:
        if seed is not None:
            torch.manual_seed(seed)

        aux_device = utils.get_auxilary_device()
        context = torch.tensor(prompt_tokens)[None, :].to(aux_device)
        out = []

        start_time = time.time()
        with torch.no_grad():
            logits, state = self.model.forward(prompt_tokens, None)
            last_token = prompt_tokens[-1]

            for _ in range(max_new):
                logits, state = self.model.forward([last_token], state)
                last_token = self._sample_token(logits, context)
                out.append(last_token)
                add = torch.tensor([[last_token]]).to(aux_device)
                context = torch.cat((context, add), dim=-1)
                self._post_token_gen(context)

        logger.debug(
            "torch_raw_generate: run generator {}s".format(time.time() - start_time)
        )

        return GenerationResult(
            self,
            out_batches=torch.tensor([out]),
            prompt=prompt_tokens,
            is_whole_generation=False,
            output_includes_prompt=True,
        )
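The removed RWKV backend above generates token by token: it runs the whole prompt through the model once to build the recurrent state, then repeatedly feeds back the last sampled token. The following self-contained sketch shows only that loop shape, with a dummy model standing in for rwkv.model.RWKV and greedy argmax standing in for _sample_token; none of it is part of this commit.

# Sketch of the per-token generation loop (dummy model, greedy sampling).
import torch

class DummyRWKV:
    vocab_size = 16

    def forward(self, tokens, state):
        # The real RWKV returns (logits, recurrent state); both are faked here.
        logits = torch.randn(self.vocab_size)
        return logits, state

model = DummyRWKV()
prompt_tokens = [3, 1, 4]
max_new = 5

logits, state = model.forward(prompt_tokens, None)  # prime the state on the prompt
last_token = prompt_tokens[-1]
out = []
for _ in range(max_new):
    logits, state = model.forward([last_token], state)
    last_token = int(torch.argmax(logits))  # stands in for _sample_token()
    out.append(last_token)

print(out)  # five newly generated token ids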