from __future__ import annotations

import os
import json
import torch
import shutil
from typing import Union

from transformers import GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig
try:
    from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
    from transformers import AutoModelForCausalLM

from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME

import utils
import modeling.lazy_loader as lazy_loader
import koboldai_settings
import importlib
from logger import logger

from modeling.inference_models.hf_torch import HFTorchInferenceModel

model_backend_name = "Huggingface"
model_backend_type = "Huggingface"  # This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)
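

# This backend recognizes checkpoints by any of the standard Hugging Face
# weight filenames (PyTorch, safetensors, TF and Flax variants) and loads
# them through the shared HFTorchInferenceModel logic, with optional
# bitsandbytes quantization.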
class model_backend(HFTorchInferenceModel):
    def __init__(self) -> None:
        super().__init__()
        self.quantization = False

    def is_valid(self, model_name, model_path, menu_path):
        base_is_valid = super().is_valid(model_name, model_path, menu_path)
        path = False
        gen_path = "models/{}".format(model_name.replace('/', '_'))
        if model_path is not None and os.path.exists(model_path):
            path = model_path
        elif os.path.exists(gen_path):
            path = gen_path

        if not path:
            # Without a local copy there are no weight files to inspect
            # (os.path.join(False, ...) would raise a TypeError below).
            return False

        fnames = [WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME]

        return base_is_valid and any(os.path.exists(os.path.join(path, fname)) for fname in fnames)

    def _initialize_model(self):
        return
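
    # Builds the extra UI controls this backend adds on top of the base
    # HFTorchInferenceModel parameters: a quantization dropdown, offered only
    # when bitsandbytes is importable, a CUDA device was detected, and the
    # checkpoint is not already pre-quantized.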
    def get_requested_parameters(self, model_name, model_path, menu_path, parameters={}):
        requested_parameters = super().get_requested_parameters(model_name, model_path, menu_path, parameters)
        if not utils.koboldai_vars.hascuda:
            logger.warning("Your GPU has not been detected and you can only make use of 32-bit inference, meaning the RAM requirements are 8 times higher than specified on the menu and your generations will be slow.\nUnless this is an error and your GPU is known to be compatible with our software, check out https://koboldai.org/cpp for a suitable alternative that has wider GPU support and can run models in 4-bit on the CPU.")

        dependency_exists = importlib.util.find_spec("bitsandbytes")
        if dependency_exists:
            if model_name != 'customhuggingface' or "custom_model_name" in parameters:
                settings_path = "settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))
                if os.path.exists(settings_path) and 'base_url' not in vars(self):
                    with open(settings_path, "r") as f:
                        temp = json.load(f)
                else:
                    temp = {}
                if not hasattr(self.model_config, 'quantization_config') and utils.koboldai_vars.hascuda:
                    requested_parameters.append({
                        "uitype": "dropdown",
                        "unit": "text",
                        "label": "Quantization",
                        "id": "quantization",
                        "default": temp['quantization'] if 'quantization' in temp else '4bit',
                        "tooltip": "Whether or not to use BnB's 4-bit or 8-bit mode",
                        "menu_path": "Layers",
                        "children": [{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}, {'text': '16-bit', 'value': '16-bit'}],
                        "extra_classes": "",
                        "refresh_model_inputs": False
                    })
        else:
            logger.warning("Bitsandbytes is not installed; you cannot use quantization for Huggingface models.")
        return requested_parameters

    def set_input_parameters(self, parameters):
        super().set_input_parameters(parameters)
        self.quantization = parameters['quantization'] if 'quantization' in parameters else False
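
    # Main load path: normalize the model name/path, assemble the
    # from_pretrained keyword arguments (cache behavior, quantization,
    # per-architecture workarounds), optionally configure breakmodel and lazy
    # loading, then load a local copy or download from the Hugging Face Hub
    # and mirror the files into models/.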
    def _load(self, save_model: bool, initial_load: bool) -> None:
        utils.koboldai_vars.allowsp = True

        # Make model path the same as the model name to make this consistent
        # with the other loading method if it isn't a known model type. This
        # code is not just a workaround for below, it is also used to make the
        # behavior consistent with other loading methods - Henk717
        # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]:
        #     utils.koboldai_vars.custmodpth = utils.koboldai_vars.model

        if self.model_name == "NeoCustom":
            self.model_name = os.path.basename(os.path.normpath(self.path))
        utils.koboldai_vars.model = self.model_name

        # If we specify a model and it's in the root directory, we need to move
        # it to the models directory (legacy folder structure to new)
        if self.get_local_model_path(legacy=True):
            shutil.move(
                self.get_local_model_path(legacy=True, ignore_existance=True),
                self.get_local_model_path(ignore_existance=True),
            )

        self.init_model_config()
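
        # init_model_config (inherited from the HF backend base classes)
        # populates self.model_config and self.model_type, which the
        # quantization and per-architecture branches below rely on.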

        tf_kwargs = {
            "low_cpu_mem_usage": True,
            "use_cache": True,  # Workaround for models that accidentally set use_cache to False
        }
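
        # tf_kwargs is eventually forwarded to the transformers loader inside
        # self._get_model; roughly (a sketch, assuming the parent helper
        # passes the dict through unchanged):
        #   AutoModelForCausalLM.from_pretrained(location, **tf_kwargs)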

        if not hasattr(self.model_config, 'quantization_config'):
            if self.quantization == "8bit":
                tf_kwargs.update({
                    "quantization_config": BitsAndBytesConfig(
                        load_in_8bit=True,
                        llm_int8_enable_fp32_cpu_offload=True
                    ),
                })

            if self.quantization == "4bit" or utils.koboldai_vars.colab_arg:
                tf_kwargs.update({
                    "quantization_config": BitsAndBytesConfig(
                        load_in_4bit=True,
                        bnb_4bit_compute_dtype=torch.float16,
                        bnb_4bit_use_double_quant=True,
                        bnb_4bit_quant_type='nf4',
                        llm_int8_enable_fp32_cpu_offload=True
                    ),
                })
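
        # The second update() overwrites any "quantization_config" set by the
        # 8-bit branch, so running with the colab flag forces 4-bit (NF4 with
        # fp16 compute) even if 8-bit was selected.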

        if self.model_type == "gpt2":
            # We must disable low_cpu_mem_usage and quantization if using a
            # GPT-2 model because GPT-2 is not compatible with these features
            # yet.
            tf_kwargs.pop("low_cpu_mem_usage", None)
            tf_kwargs.pop("quantization_config", None)

            # Also, the lazy loader doesn't support GPT-2 models
            self.lazy_load = False

        if self.model_type == "llama":
            tf_kwargs.update({
                "pretraining_tp": 1  # Workaround recommended by HF for the bad pretraining_tp values in config.json files that tuners adopted
            })

        logger.debug(
            "lazy_load: {} hascuda: {} breakmodel: {} nobreakmodel: {}".format(
                self.lazy_load,
                utils.koboldai_vars.hascuda,
                self.breakmodel,
                self.nobreakmodel,
            )
        )

        # If we're using torch_lazy_loader, we need to get breakmodel config
        # early so that it knows where to load the individual model tensors
        if (
            self.lazy_load
            and utils.koboldai_vars.hascuda
            and utils.koboldai_vars.breakmodel
            and not utils.koboldai_vars.nobreakmodel
        ):
            self.breakmodel_device_config(self.model_config)

        if self.lazy_load:
            # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the
            # same time
            tf_kwargs.pop("low_cpu_mem_usage", None)

            # If we're using the lazy loader, we need to figure out what the
            # model's hidden layers are called
            with lazy_loader.use_lazy_load(dematerialized_modules=True):
                try:
                    metamodel = AutoModelForCausalLM.from_config(self.model_config)
                    utils.layers_module_names = utils.get_layers_module_names(metamodel)
                    utils.module_names = list(metamodel.state_dict().keys())
                    utils.named_buffers = list(metamodel.named_buffers(recurse=True))
                except Exception as e:
                    if utils.args.panic:
                        raise e
                    logger.warning(f"Gave up on lazy loading due to {e}")
                    self.lazy_load = False
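
        # With dematerialized_modules=True the metamodel is constructed
        # without allocating real weight tensors; only the module/parameter
        # names and buffer layout are harvested for the lazy loader.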

        # Download the model from Huggingface if it does not exist, otherwise
        # load it locally
        if self.get_local_model_path():
            # Model is stored locally, load it.
            self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
            self.tokenizer = self._get_tokenizer(self.get_local_model_path())
        else:
            # Model not stored locally, we need to download it.

            # _rebuild_tensor patch for casting dtype and supporting LazyTensors
            old_rebuild_tensor = torch._utils._rebuild_tensor

            def new_rebuild_tensor(
                storage: Union[lazy_loader.LazyTensor, torch.Storage],
                storage_offset,
                shape,
                stride,
            ):
                if not isinstance(storage, lazy_loader.LazyTensor):
                    dtype = storage.dtype
                else:
                    dtype = storage.storage_type.dtype
                    if not isinstance(dtype, torch.dtype):
                        dtype = storage.storage_type(0).dtype
                if dtype is torch.float32 and len(shape) >= 2:
                    utils.koboldai_vars.fp32_model = True
                return old_rebuild_tensor(storage, storage_offset, shape, stride)

            torch._utils._rebuild_tensor = new_rebuild_tensor
            self.model = self._get_model(self.model_name, tf_kwargs)
            self.tokenizer = self._get_tokenizer(self.model_name)
            torch._utils._rebuild_tensor = old_rebuild_tensor
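
            # The monkey patch only needs to be active while the checkpoint is
            # materialized above; restoring the original _rebuild_tensor keeps
            # it from leaking into unrelated torch.load calls. Its one side
            # effect is flagging fp32 checkpoints so they can be halved before
            # being saved below.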

            if save_model:
                self.tokenizer.save_pretrained(
                    self.get_local_model_path(ignore_existance=True)
                )

                if utils.koboldai_vars.fp32_model:
                    # Use save_pretrained to convert fp32 models to fp16,
                    # unless we are using disk cache because save_pretrained
                    # is not supported in that case
                    self.model = self.model.half()
                    self.model.save_pretrained(
                        self.get_local_model_path(ignore_existance=True),
                        max_shard_size="500MiB",
                    )

                else:
                    # For fp16 models, we can just copy the model files directly
                    import transformers.configuration_utils
                    import transformers.modeling_utils
                    import transformers.file_utils
                    import huggingface_hub

                    # Save the config.json
                    shutil.move(
                        os.path.realpath(
                            huggingface_hub.hf_hub_download(
                                self.model_name,
                                transformers.configuration_utils.CONFIG_NAME,
                                revision=utils.koboldai_vars.revision,
                                cache_dir="cache",
                                local_files_only=True,
                                legacy_cache_layout=False,
                            )
                        ),
                        os.path.join(
                            self.get_local_model_path(ignore_existance=True),
                            transformers.configuration_utils.CONFIG_NAME,
                        ),
                    )
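
                    # local_files_only=True makes hf_hub_download resolve the
                    # path of a file that from_pretrained already placed in
                    # cache/ instead of fetching anything; the moves here
                    # relocate those cached files into models/.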

                    if utils.num_shards is None:
                        # Save the pytorch_model.bin or model.safetensors of
                        # an unsharded model
                        any_success = False
                        possible_checkpoint_names = [
                            transformers.modeling_utils.WEIGHTS_NAME,
                            "model.safetensors",
                        ]
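
                        # Try the torch checkpoint name first, then the
                        # safetensors name; whichever exists in the cache is
                        # moved into place.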

                        for possible_checkpoint_name in possible_checkpoint_names:
                            try:
                                shutil.move(
                                    os.path.realpath(
                                        huggingface_hub.hf_hub_download(
                                            self.model_name,
                                            possible_checkpoint_name,
                                            revision=utils.koboldai_vars.revision,
                                            cache_dir="cache",
                                            local_files_only=True,
                                            legacy_cache_layout=False,
                                        )
                                    ),
                                    os.path.join(
                                        self.get_local_model_path(
                                            ignore_existance=True
                                        ),
                                        possible_checkpoint_name,
                                    ),
                                )
                                any_success = True
                            except Exception:
                                pass

                        if not any_success:
                            raise RuntimeError(
                                f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revision}'"
                            )
                    else:
                        # Handle saving sharded models

                        with open(utils.from_pretrained_index_filename) as f:
                            map_data = json.load(f)
                        filenames = set(map_data["weight_map"].values())

                        # Save the pytorch_model.bin.index.json of a sharded model
                        shutil.move(
                            os.path.realpath(utils.from_pretrained_index_filename),
                            os.path.join(
                                self.get_local_model_path(ignore_existance=True),
                                transformers.modeling_utils.WEIGHTS_INDEX_NAME,
                            ),
                        )

                        # Then save the pytorch_model-#####-of-#####.bin files
                        for filename in filenames:
                            shutil.move(
                                os.path.realpath(
                                    huggingface_hub.hf_hub_download(
                                        self.model_name,
                                        filename,
                                        revision=utils.koboldai_vars.revision,
                                        cache_dir="cache",
                                        local_files_only=True,
                                        legacy_cache_layout=False,
                                    )
                                ),
                                os.path.join(
                                    self.get_local_model_path(ignore_existance=True),
                                    filename,
                                ),
                            )

                shutil.rmtree("cache/")
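
        # The model and tokenizer are now in memory (and mirrored into
        # models/ if requested); the remaining steps wire them into KoboldAI.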
        self.patch_embedding()

        self.model.kai_model = self
        utils.koboldai_vars.modeldim = self.get_hidden_size()
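
    # Persists the per-model backend choices (layer split and quantization
    # mode) so get_requested_parameters can restore them as defaults the next
    # time this model is loaded.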
    def _save_settings(self):
        with open(
            "settings/{}.generic_hf_torch.model_backend.settings".format(
                self.model_name.replace("/", "_")
            ),
            "w",
        ) as f:
            json.dump(
                {
                    "layers": self.layers if "layers" in vars(self) else [],
                    "disk_layers": self.disk_layers
                    if "disk_layers" in vars(self)
                    else 0,
                    "quantization": self.quantization,
                },
                f,
                indent="",
            )