Merge ebolam's model-plugins branch

0cc4m
2023-05-28 09:26:13 +02:00
33 changed files with 3503 additions and 1631 deletions


@@ -1,25 +1,230 @@
-import os
+import os, sys
from typing import Optional
# Merge note: HEAD imported AutoConfig from hf_bleeding_edge, while
# ebolam/Model_Plugins imported it from transformers and added warnings
# (used in unload() below); prefer the hf_bleeding_edge import and fall
# back to transformers if it is not installed.
try:
    from hf_bleeding_edge import AutoConfig
except ImportError:
    from transformers import AutoConfig
import warnings
import utils
import json
import koboldai_settings
from logger import logger
from modeling.inference_model import InferenceModel
import torch
import gc
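# Base Hugging Face backend for the model-plugin system: it holds the model,
# tokenizer and config, builds the layer-allocation parameters the web UI
# renders, and applies per-architecture tokenizer fixes after loading.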
class HFInferenceModel(InferenceModel):
-def __init__(self, model_name: str) -> None:
+def __init__(self) -> None:
super().__init__()
self.model_config = None
-self.model_name = model_name
+#self.model_name = model_name
self.model = None
self.tokenizer = None
self.badwordsids = koboldai_settings.badwordsids_default
self.usegpu = False
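# Capability probe: if an AutoConfig can be loaded from the supplied path,
# the local models/ folder, or the Hugging Face hub, this backend can serve
# the model; any exception means it cannot.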
def is_valid(self, model_name, model_path, menu_path):
try:
if model_path is not None and os.path.exists(model_path):
self.model_config = AutoConfig.from_pretrained(model_path)
elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))):
self.model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache")
else:
self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
return True
except:
return False
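# Returns the list of UI control descriptors the frontend renders for this
# backend: a text box for a custom Hugging Face model name and, when the
# config exposes a layer count, one layer slider per CUDA device plus CPU
# (and optionally Disk).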
def get_requested_parameters(self, model_name, model_path, menu_path, parameters = {}):
requested_parameters = []
if not self.hf_torch:
return []
if model_name == 'customhuggingface':
requested_parameters.append({
"uitype": "text",
"unit": "text",
"label": "Huggingface Model Name",
"id": "custom_model_name",
"default": parameters["custom_model_name"] if "custom_model_name" in parameters and parameters["custom_model_name"] != "" else "",
"check": {"value": "", 'check': "!="},
"tooltip": "Model name from https://huggingface.co/",
"menu_path": "",
"refresh_model_inputs": True,
"extra_classes": ""
})
if model_name != 'customhuggingface' or "custom_model_name" in parameters:
model_name = parameters["custom_model_name"] if "custom_model_name" in parameters and parameters["custom_model_name"] != "" else model_name
if model_path is not None and os.path.exists(model_path):
self.model_config = AutoConfig.from_pretrained(model_path)
elif(os.path.exists("models/{}".format(model_name.replace('/', '_')))):
self.model_config = AutoConfig.from_pretrained("models/{}".format(model_name.replace('/', '_')), revision=utils.koboldai_vars.revision, cache_dir="cache")
else:
self.model_config = AutoConfig.from_pretrained(model_name, revision=utils.koboldai_vars.revision, cache_dir="cache")
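# Hugging Face config classes expose the layer count under different
# attribute names (n_layer, num_layers, num_hidden_layers), so probe them
# in order; GPT-2 is skipped because it has no breakmodel support.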
layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
layer_count = None if hasattr(self, "get_model_type") and self.get_model_type() == "gpt2" else layer_count #Skip layers if we're a GPT2 model as it doesn't support breakmodel
if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
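# Reuse a previously saved layer split for this model if a settings file
# exists; otherwise default to putting every layer on the first GPU with
# nothing on disk.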
if os.path.exists("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
with open("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
temp = json.load(f)
break_values = temp['layers'] if 'layers' in temp else [layer_count]
disk_blocks = temp['disk_layers'] if 'disk_layers' in temp else 0
else:
break_values = [layer_count]
disk_blocks = 0
break_values = [int(x) for x in break_values if x != '' and x is not None]
gpu_count = torch.cuda.device_count()
break_values += [0] * (gpu_count - len(break_values))
if disk_blocks is not None:
break_values += [int(disk_blocks)]
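# Each descriptor below becomes one UI control; the shared "check"
# constraint requires the GPU/CPU/Disk sliders to sum to layer_count.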
requested_parameters.append({
"uitype": "Valid Display",
"unit": "text",
"label": "Current Allocated Layers: %1/{}".format(layer_count), #%1 will be the validation value
"id": "valid_layers",
"max": layer_count,
"step": 1,
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"menu_path": "Layers",
"extra_classes": "",
"refresh_model_inputs": False
})
for i in range(gpu_count):
requested_parameters.append({
"uitype": "slider",
"unit": "int",
"label": "{} Layers".format(torch.cuda.get_device_name(i)),
"id": "{}_Layers".format(i),
"min": 0,
"max": layer_count,
"step": 1,
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"check_message": "The sum of assigned layers must equal {}".format(layer_count),
"default": break_values[i],
"tooltip": "The number of layers to put on {}.".format(torch.cuda.get_device_name(i)),
"menu_path": "Layers",
"extra_classes": "",
"refresh_model_inputs": False
})
requested_parameters.append({
"uitype": "slider",
"unit": "int",
"label": "CPU Layers",
"id": "CPU_Layers",
"min": 0,
"max": layer_count,
"step": 1,
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"check_message": "The sum of assigned layers must equal {}".format(layer_count),
"default": layer_count - sum(break_values),
"tooltip": "The number of layers to put on the CPU. This will use your system RAM. It will also do inference partially on CPU. Use if you must.",
"menu_path": "Layers",
"extra_classes": "",
"refresh_model_inputs": False
})
if disk_blocks is not None:
requested_parameters.append({
"uitype": "slider",
"unit": "int",
"label": "Disk Layers",
"id": "Disk_Layers",
"min": 0,
"max": layer_count,
"step": 1,
"check": {"sum": ["{}_Layers".format(i) for i in range(gpu_count)]+['CPU_Layers']+(['Disk_Layers'] if disk_blocks is not None else []), "value": layer_count, 'check': "="},
"check_message": "The sum of assigned layers must equal {}".format(layer_count),
"default": disk_blocks,
"tooltip": "The number of layers to put on the disk. This will use your hard drive. The is VERY slow in comparison to GPU or CPU. Use as a last resort.",
"menu_path": "Layers",
"extra_classes": "",
"refresh_model_inputs": False
})
else:
requested_parameters.append({
"uitype": "toggle",
"unit": "bool",
"label": "Use GPU",
"id": "use_gpu",
"default": True,
"tooltip": "Whether or not to use the GPU",
"menu_path": "Layers",
"extra_classes": "",
"refresh_model_inputs": False
})
return requested_parameters
def set_input_parameters(self, parameters):
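# `parameters` is the dict the UI sends back, keyed by the ids declared in
# get_requested_parameters ("0_Layers", "CPU_Layers", "Disk_Layers",
# "use_gpu", ...) plus menu fields such as "id" and "path"; slider values
# may arrive as strings. Illustrative shape only (hypothetical values):
#   {"0_Layers": "28", "CPU_Layers": "4", "Disk_Layers": 0,
#    "id": "customhuggingface", "custom_model_name": "gpt2", "path": None}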
if self.hf_torch and hasattr(self, "get_model_type") and self.get_model_type() != "gpt2":
import breakmodel
layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None
if layer_count is not None and layer_count >= 0 and not self.nobreakmodel:
gpu_count = torch.cuda.device_count()
layers = []
for i in range(gpu_count):
if isinstance(parameters["{}_Layers".format(i)], str) and parameters["{}_Layers".format(i)].isnumeric():
layers.append(int(parameters["{}_Layers".format(i)]))
elif isinstance(parameters["{}_Layers".format(i)], str):
layers.append(None)
else:
layers.append(parameters["{}_Layers".format(i)])
self.cpu_layers = int(parameters['CPU_Layers']) if 'CPU_Layers' in parameters else None
if isinstance(self.cpu_layers, str):
self.cpu_layers = int(self.cpu_layers) if self.cpu_layers.isnumeric() else 0
self.layers = layers
self.disk_layers = parameters['Disk_Layers'] if 'Disk_Layers' in parameters else 0
if isinstance(self.disk_layers, str):
self.disk_layers = int(self.disk_layers) if self.disk_layers.isnumeric() else 0
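# Hand the parsed split to the breakmodel module; usegpu is only set when
# every layer lands on GPU 0 with nothing on CPU or disk.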
breakmodel.gpu_blocks = layers
breakmodel.disk_blocks = self.disk_layers
self.usegpu = self.cpu_layers == 0 and breakmodel.disk_blocks == 0 and sum(self.layers)-self.layers[0] == 0
self.model_type = self.get_model_type()
self.breakmodel = ((self.model_type != 'gpt2') or self.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not self.nobreakmodel
self.lazy_load = True
logger.debug("Model type: {}".format(self.model_type))
else:
logger.debug("Disabling breakmodel and lazyload")
self.usegpu = parameters['use_gpu'] if 'use_gpu' in parameters else None
self.breakmodel = False
self.lazy_load = False
logger.info(parameters)
self.model_name = parameters['custom_model_name'] if 'custom_model_name' in parameters else parameters['id']
self.path = parameters['path'] if 'path' in parameters else None
def unload(self):
if hasattr(self, 'model'):
self.model = None
if hasattr(self, 'tokenizer'):
self.tokenizer = None
if hasattr(self, 'model_config'):
self.model_config = None
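# Aggressively release GPU memory: walk every object the garbage collector
# tracks and resize any torch tensor to empty so its storage can actually be
# freed before torch.cuda.empty_cache() runs; the warnings filter hides the
# torch.distributed.reduce_op deprecation notice this sweep can trigger.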
with torch.no_grad():
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="torch.distributed.reduce_op is deprecated")
for tensor in gc.get_objects():
try:
if torch.is_tensor(tensor):
tensor.set_(torch.tensor((), device=tensor.device, dtype=tensor.dtype))
except:
pass
gc.collect()
try:
with torch.no_grad():
torch.cuda.empty_cache()
except:
pass
def _post_load(self) -> None:
self.badwordsids = koboldai_settings.badwordsids_default
self.model_type = str(self.model_config.model_type)
# These are model specific tokenizer overrides if a model has bad defaults
if utils.koboldai_vars.model_type == "llama":
if self.model_type == "llama":
# Note: self.tokenizer is a GenericTokenizer, and self.tokenizer.tokenizer is the actual LlamaTokenizer
self.tokenizer.add_bos_token = False
@@ -59,7 +264,7 @@ class HFInferenceModel(InferenceModel):
token_ids = [first]
elif len(token_ids) > 0:
first = int(token_ids[0])
-elif token_ids:
+elif token_ids is not None and len(token_ids) > 0:
first = token_ids[0]
result = original_decode(self, token_ids, *args, **kwargs)
if first is not None and first in has_prefix_space:
@@ -103,32 +308,32 @@ class HFInferenceModel(InferenceModel):
return result
object.__setattr__(self.tokenizer, '__call__', call_wrapper.__get__(self.tokenizer))
elif utils.koboldai_vars.model_type == "opt":
elif self.model_type == "opt":
self.tokenizer._koboldai_header = self.tokenizer.encode("")
self.tokenizer.add_bos_token = False
self.tokenizer.add_prefix_space = False
# Change newline behavior to match model quirks
if utils.koboldai_vars.model_type == "xglm":
if self.model_type == "xglm":
# Default to </s> newline mode if using XGLM
utils.koboldai_vars.newlinemode = "s"
elif utils.koboldai_vars.model_type in ["opt", "bloom"]:
elif self.model_type in ["opt", "bloom"]:
# Handle </s> but don't convert newlines if using Fairseq models that have newlines trained in them
utils.koboldai_vars.newlinemode = "ns"
# Clean up tokens that cause issues
if (
-utils.koboldai_vars.badwordsids == koboldai_settings.badwordsids_default
-and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj")
+self.badwordsids == koboldai_settings.badwordsids_default
+and self.model_type not in ("gpt2", "gpt_neo", "gptj")
):
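# Rebuild the ban list from the vocabulary so models whose tokenizers differ
# from GPT-2/Neo/J still block every token containing a square bracket.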
-utils.koboldai_vars.badwordsids = [
+self.badwordsids = [
[v]
for k, v in self.tokenizer.get_vocab().items()
if any(c in str(k) for c in "[]")
]
if utils.koboldai_vars.newlinemode == "n":
-utils.koboldai_vars.badwordsids.append([self.tokenizer.eos_token_id])
+self.badwordsids.append([self.tokenizer.eos_token_id])
return super()._post_load()
@@ -139,9 +344,12 @@ class HFInferenceModel(InferenceModel):
Returns a string of the model's path locally, or None if it is not downloaded.
If ignore_existance is true, it will always return a path.
"""
+if self.path is not None:
+    if os.path.exists(self.path):
+        return self.path
if self.model_name in ["NeoCustom", "GPT2Custom", "TPUMeshTransformerGPTJ", "TPUMeshTransformerGPTNeoX"]:
-model_path = utils.koboldai_vars.custmodpth
+model_path = self.path
assert model_path
# Path can be absolute or relative to models directory
@@ -158,7 +366,7 @@ class HFInferenceModel(InferenceModel):
return model_path
basename = utils.koboldai_vars.model.replace("/", "_")
basename = self.model_name.replace("/", "_")
if legacy:
ret = basename
else:
@@ -176,24 +384,25 @@ class HFInferenceModel(InferenceModel):
revision=utils.koboldai_vars.revision,
cache_dir="cache",
)
-utils.koboldai_vars.model_type = self.model_config.model_type
+self.model_type = self.model_config.model_type
if "gptq_bits" in dir(self.model_config):
-utils.koboldai_vars.gptq_model = True
-utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits
-utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1
-utils.koboldai_vars.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1
-utils.koboldai_vars.gptq_file = None
+self.gptq_model = True
+self.gptq_bits = self.model_config.gptq_bits
+self.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1
+self.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1
+self.gptq_file = None
else:
-utils.koboldai_vars.gptq_model = False
+self.gptq_model = False
except ValueError:
-utils.koboldai_vars.model_type = {
+self.model_type = {
"NeoCustom": "gpt_neo",
"GPT2Custom": "gpt2",
-}.get(utils.koboldai_vars.model)
+}.get(self.model)
-if not utils.koboldai_vars.model_type:
+if not self.model_type:
logger.warning(
"No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)"
)
utils.koboldai_vars.model_type = "gpt_neo"
self.model_type = "gpt_neo"