from __future__ import annotations

import os
import json
import torch
import shutil
from typing import Union

from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, GPT2LMHeadModel

import utils
import modeling.lazy_loader as lazy_loader
import koboldai_settings
import importlib
from logger import logger

from modeling.inference_models.hf_torch import HFTorchInferenceModel

model_backend_name = "Huggingface"
model_backend_type = "Huggingface"  # This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)
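

# Generic Hugging Face (PyTorch) backend: loads causal LM checkpoints through
# transformers, optionally with bitsandbytes 4-bit quantization, KoboldAI's lazy
# loader, and the breakmodel layer splitting handled by HFTorchInferenceModel.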
class model_backend(HFTorchInferenceModel):

    def _initialize_model(self):
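        # Nothing to set up ahead of time; the model and tokenizer are built in
        # _load() below.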
        return
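
    # Expose the standard Hugging Face loading options plus an extra "Use 4-bit"
    # toggle for the model-loading UI when bitsandbytes is available.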
    def get_requested_parameters(self, model_name, model_path, menu_path, parameters={}):
        requested_parameters = super().get_requested_parameters(model_name, model_path, menu_path, parameters)
        dependency_exists = importlib.util.find_spec("bitsandbytes")
        if dependency_exists:
            if model_name != 'customhuggingface' or "custom_model_name" in parameters:
                requested_parameters.append({
                    "uitype": "toggle",
                    "unit": "bool",
                    "label": "Use 4-bit",
                    "id": "use_4_bit",
                    "default": False,
                    "tooltip": "Whether or not to use BnB's 4-bit mode",
                    "menu_path": "Layers",
                    "extra_classes": "",
                    "refresh_model_inputs": False
                })
        else:
            logger.warning("Bitsandbytes is not installed; you cannot use Huggingface models in 4-bit")
        return requested_parameters
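
    # Record the 4-bit choice alongside the parameters handled by the parent class.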
    def set_input_parameters(self, parameters):
        super().set_input_parameters(parameters)
        self.use_4_bit = parameters['use_4_bit']
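
    # Load the model either from a local folder or by downloading it from the
    # Hugging Face Hub, optionally saving a local copy afterwards.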
    def _load(self, save_model: bool, initial_load: bool) -> None:
        utils.koboldai_vars.allowsp = True

        # Make model path the same as the model name to make this consistent
        # with the other loading method if it isn't a known model type. This
        # code is not just a workaround for below, it is also used to make the
        # behavior consistent with other loading methods - Henk717
        # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]:
        #     utils.koboldai_vars.custmodpth = utils.koboldai_vars.model

        if self.model_name == "NeoCustom":
            self.model_name = os.path.basename(os.path.normpath(self.path))
        utils.koboldai_vars.model = self.model_name

        # If we specify a model and it's in the root directory, we need to move
        # it to the models directory (legacy folder structure to new)
        if self.get_local_model_path(legacy=True):
            shutil.move(
                self.get_local_model_path(legacy=True, ignore_existance=True),
                self.get_local_model_path(ignore_existance=True),
            )

        self.init_model_config()

        tf_kwargs = {
            "low_cpu_mem_usage": True,
        }
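
        # When 4-bit is requested, disable lazy loading and ask transformers to
        # quantize the weights with bitsandbytes at load time (load_in_4bit).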
        if self.use_4_bit:
            self.lazy_load = False
            tf_kwargs.update({
                "load_in_4bit": True,
            })

        if self.model_type == "gpt2":
            # We must disable low_cpu_mem_usage if using a GPT-2 model
            # because GPT-2 is not compatible with this feature yet.
            tf_kwargs.pop("low_cpu_mem_usage", None)

            # Also, lazy loader doesn't support GPT-2 models
            self.lazy_load = False

        logger.debug(
            "lazy_load: {} hascuda: {} breakmodel: {} nobreakmodel: {}".format(
                self.lazy_load,
                utils.koboldai_vars.hascuda,
                self.breakmodel,
                self.nobreakmodel,
            )
        )

        # If we're using torch_lazy_loader, we need to get breakmodel config
        # early so that it knows where to load the individual model tensors
        if (
            self.lazy_load
            and utils.koboldai_vars.hascuda
            and utils.koboldai_vars.breakmodel
            and not utils.koboldai_vars.nobreakmodel
        ):
            self.breakmodel_device_config(self.model_config)

        if self.lazy_load:
            # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
            tf_kwargs.pop("low_cpu_mem_usage", None)

            # If we're using lazy loader, we need to figure out what the model's hidden layers are called
            with lazy_loader.use_lazy_load(dematerialized_modules=True):
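                # Instantiate a skeleton model from the config alone (no weights
                # are materialized) so we can read off the module names and
                # buffers that the lazy loader needs.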
                try:
                    metamodel = AutoModelForCausalLM.from_config(self.model_config)
                    utils.layers_module_names = utils.get_layers_module_names(metamodel)
                    utils.module_names = list(metamodel.state_dict().keys())
                    utils.named_buffers = list(metamodel.named_buffers(recurse=True))
                except Exception as e:
                    if utils.args.panic:
                        raise e
                    logger.warning(f"Gave up on lazy loading due to {e}")
                    self.lazy_load = False

        # Download model from Huggingface if it does not exist, otherwise load locally
        if self.get_local_model_path():
            # Model is stored locally, load it.
            self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
            self.tokenizer = self._get_tokenizer(self.get_local_model_path())
        else:
            # Model not stored locally, we need to download it.

            # _rebuild_tensor patch for casting dtype and supporting LazyTensors
            old_rebuild_tensor = torch._utils._rebuild_tensor
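
            # Temporarily wrap torch's internal tensor rebuild hook so fp32
            # checkpoints can be detected while they are deserialized.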
            def new_rebuild_tensor(
                storage: Union[lazy_loader.LazyTensor, torch.Storage],
                storage_offset,
                shape,
                stride,
            ):
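                # Real storages expose their dtype directly; LazyTensors only
                # carry a storage_type, so derive the dtype from that instead.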
                if not isinstance(storage, lazy_loader.LazyTensor):
                    dtype = storage.dtype
                else:
                    dtype = storage.storage_type.dtype
                    if not isinstance(dtype, torch.dtype):
                        dtype = storage.storage_type(0).dtype
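                # Treat any fp32 tensor with 2 or more dimensions as evidence of
                # an fp32 checkpoint so it can be converted to fp16 when saved.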
                if dtype is torch.float32 and len(shape) >= 2:
                    utils.koboldai_vars.fp32_model = True
                return old_rebuild_tensor(storage, storage_offset, shape, stride)

            torch._utils._rebuild_tensor = new_rebuild_tensor

            self.model = self._get_model(self.model_name, tf_kwargs)
            self.tokenizer = self._get_tokenizer(self.model_name)

            torch._utils._rebuild_tensor = old_rebuild_tensor
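
            # Optionally persist a local copy so future loads skip the download.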
            if save_model:
                self.tokenizer.save_pretrained(
                    self.get_local_model_path(ignore_existance=True)
                )

                if utils.koboldai_vars.fp32_model:
                    # Use save_pretrained to convert fp32 models to fp16,
                    # unless we are using disk cache because save_pretrained
                    # is not supported in that case
                    self.model = self.model.half()
                    self.model.save_pretrained(
                        self.get_local_model_path(ignore_existance=True),
                        max_shard_size="500MiB",
                    )

                else:
                    # For fp16 models, we can just copy the model files directly
                    import transformers.configuration_utils
                    import transformers.modeling_utils
                    import transformers.file_utils
                    import huggingface_hub
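
                    # The files are already in the local "cache" directory, so
                    # hf_hub_download(local_files_only=True) only resolves their
                    # cached paths instead of downloading anything.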
                    # Save the config.json
                    shutil.move(
                        os.path.realpath(
                            huggingface_hub.hf_hub_download(
                                self.model_name,
                                transformers.configuration_utils.CONFIG_NAME,
                                revision=utils.koboldai_vars.revision,
                                cache_dir="cache",
                                local_files_only=True,
                                legacy_cache_layout=False,
                            )
                        ),
                        os.path.join(
                            self.get_local_model_path(ignore_existance=True),
                            transformers.configuration_utils.CONFIG_NAME,
                        ),
                    )

                    if utils.num_shards is None:
                        # Save the pytorch_model.bin or model.safetensors of an unsharded model
                        any_success = False
                        possible_checkpoint_names = [
                            transformers.modeling_utils.WEIGHTS_NAME,
                            "model.safetensors",
                        ]

                        for possible_checkpoint_name in possible_checkpoint_names:
                            try:
                                shutil.move(
                                    os.path.realpath(
                                        huggingface_hub.hf_hub_download(
                                            self.model_name,
                                            possible_checkpoint_name,
                                            revision=utils.koboldai_vars.revision,
                                            cache_dir="cache",
                                            local_files_only=True,
                                            legacy_cache_layout=False,
                                        )
                                    ),
                                    os.path.join(
                                        self.get_local_model_path(
                                            ignore_existance=True
                                        ),
                                        possible_checkpoint_name,
                                    ),
                                )
                                any_success = True
                            except Exception:
                                pass

                        if not any_success:
                            raise RuntimeError(
                                f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revision}'"
                            )
                    else:
                        # Handle saving sharded models

                        with open(utils.from_pretrained_index_filename) as f:
                            map_data = json.load(f)
                        filenames = set(map_data["weight_map"].values())
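                        # weight_map maps tensor names to shard files; the set of
                        # its values is the list of shard files to copy.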
                        # Save the pytorch_model.bin.index.json of a sharded model
                        shutil.move(
                            os.path.realpath(utils.from_pretrained_index_filename),
                            os.path.join(
                                self.get_local_model_path(ignore_existance=True),
                                transformers.modeling_utils.WEIGHTS_INDEX_NAME,
                            ),
                        )
                        # Then save the pytorch_model-#####-of-#####.bin files
                        for filename in filenames:
                            shutil.move(
                                os.path.realpath(
                                    huggingface_hub.hf_hub_download(
                                        self.model_name,
                                        filename,
                                        revision=utils.koboldai_vars.revision,
                                        cache_dir="cache",
                                        local_files_only=True,
                                        legacy_cache_layout=False,
                                    )
                                ),
                                os.path.join(
                                    self.get_local_model_path(ignore_existance=True),
                                    filename,
                                ),
                            )
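
                    # Everything now lives in the local model folder, so the
                    # temporary download cache can be removed.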
                    shutil.rmtree("cache/")
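
        # Patch the embedding layer for soft prompt support (allowsp was enabled
        # above) and record the model's hidden size for the rest of the app.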
        self.patch_embedding()

        self.model.kai_model = self
        utils.koboldai_vars.modeldim = self.get_hidden_size()
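
    # Save this backend's layer split (GPU layers / disk layers) for this model
    # to the settings folder.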
    def _save_settings(self):
        with open(
            "settings/{}.generic_hf_torch.model_backend.settings".format(
                self.model_name.replace("/", "_")
            ),
            "w",
        ) as f:
            json.dump(
                {
                    "layers": self.layers if "layers" in vars(self) else [],
                    "disk_layers": self.disk_layers
                    if "disk_layers" in vars(self)
                    else 0,
                },
                f,
                indent="",
            )