KoboldAI-Client/modeling/inference_models/generic_hf_torch/class.py

from __future__ import annotations
import os
import json
import torch
import shutil
from typing import Union
from transformers import GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig
try:
    from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
    from transformers import AutoModelForCausalLM
from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
import utils
import modeling.lazy_loader as lazy_loader
import koboldai_settings
import importlib
from logger import logger
from modeling.inference_models.hf_torch import HFTorchInferenceModel

model_backend_name = "Huggingface"
model_backend_type = "Huggingface"  # This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)
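
# Generic Hugging Face (PyTorch) model backend. It wraps HFTorchInferenceModel with
# optional bitsandbytes quantization (4-bit / 8-bit / 16-bit, selectable from the UI),
# lazy loading of checkpoints, and a local models directory that downloaded weights
# are saved into on first load.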
class model_backend(HFTorchInferenceModel):
    def __init__(self) -> None:
        super().__init__()
        self.quantization = False

    def is_valid(self, model_name, model_path, menu_path):
        base_is_valid = super().is_valid(model_name, model_path, menu_path)
        path = False
        gen_path = "models/{}".format(model_name.replace('/', '_'))
        if model_path is not None and os.path.exists(model_path):
            path = model_path
        elif os.path.exists(gen_path):
            path = gen_path

        # No local copy of the model exists, so there are no weight files to check for
        if not path:
            return False

        fnames = [WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME]
        return base_is_valid and any(os.path.exists(os.path.join(path, fname)) for fname in fnames)
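
    # This backend builds the model inside _load, so the _initialize_model hook is a no-op here.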
    def _initialize_model(self):
        return
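
    # Builds the UI parameter list for this backend. On top of the base class parameters it
    # adds a "Quantization" dropdown (4-bit / 8-bit / 16-bit), but only when bitsandbytes is
    # importable, CUDA is available, and the model config does not already ship its own
    # quantization_config. A previously saved settings file for the model is used to
    # pre-select the default value.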
    def get_requested_parameters(self, model_name, model_path, menu_path, parameters={}):
        requested_parameters = super().get_requested_parameters(model_name, model_path, menu_path, parameters)
        if not utils.koboldai_vars.hascuda:
            logger.warning("Your GPU has not been detected, so you can only use 32-bit inference, meaning the RAM requirements are 8 times higher than specified on the menu and your generations will be slow.\nUnless this is an error and your GPU is known to be compatible with our software, check out https://koboldai.org/cpp for a suitable alternative that has wider GPU support and can run models in 4-bit on the CPU.")
        dependency_exists = importlib.util.find_spec("bitsandbytes")
        if dependency_exists:
            if model_name != 'customhuggingface' or "custom_model_name" in parameters:
                if os.path.exists("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_"))) and 'base_url' not in vars(self):
                    with open("settings/{}.generic_hf_torch.model_backend.settings".format(model_name.replace("/", "_")), "r") as f:
                        temp = json.load(f)
                else:
                    temp = {}
                if not hasattr(self.model_config, 'quantization_config') and utils.koboldai_vars.hascuda:
                    requested_parameters.append({
                        "uitype": "dropdown",
                        "unit": "text",
                        "label": "Quantization",
                        "id": "quantization",
                        "default": temp['quantization'] if 'quantization' in temp else '4bit' if dependency_exists else '16-bit',
                        "tooltip": "Whether or not to use BnB's 4-bit or 8-bit mode",
                        "menu_path": "Layers",
                        "children": [{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}, {'text': '16-bit', 'value': '16-bit'}],
                        "extra_classes": "",
                        "refresh_model_inputs": False
                    })
        else:
            logger.warning("Bitsandbytes is not installed, so you cannot use quantization for Huggingface models.")
        return requested_parameters

    def set_input_parameters(self, parameters):
        super().set_input_parameters(parameters)
        self.quantization = parameters['quantization'] if 'quantization' in parameters else False
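
    # Loads the model and tokenizer. The overall flow is: normalise the model name/path and
    # migrate legacy folder layouts, build the Transformers keyword arguments (including any
    # BitsAndBytesConfig), optionally set up the lazy loader and the breakmodel device
    # configuration, then either load from the local models directory or download from the
    # Hugging Face Hub and, if requested, save a local copy.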
    def _load(self, save_model: bool, initial_load: bool) -> None:
        utils.koboldai_vars.allowsp = True

        # Make model path the same as the model name to make this consistent
        # with the other loading method if it isn't a known model type. This
        # code is not just a workaround for below, it is also used to make the
        # behavior consistent with other loading methods - Henk717
        # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]:
        #     utils.koboldai_vars.custmodpth = utils.koboldai_vars.model

        if self.model_name == "NeoCustom":
            self.model_name = os.path.basename(os.path.normpath(self.path))
        utils.koboldai_vars.model = self.model_name

        # If we specify a model and it's in the root directory, we need to move
        # it to the models directory (legacy folder structure to new)
        if self.get_local_model_path(legacy=True):
            shutil.move(
                self.get_local_model_path(legacy=True, ignore_existance=True),
                self.get_local_model_path(ignore_existance=True),
            )

        self.init_model_config()

        tf_kwargs = {
            "low_cpu_mem_usage": True,
            "use_cache": True,  # Workaround for models that accidentally turn cache to false
        }
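
        # Translate the quantization choice from the UI into Transformers kwargs. A model that
        # already ships a quantization_config in its own config is left untouched; otherwise
        # 8-bit or 4-bit bitsandbytes loading is configured here (4-bit is also forced when
        # running with the Colab flag).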
        if not hasattr(self.model_config, 'quantization_config'):
            if self.quantization == "8bit":
                tf_kwargs.update({
                    "quantization_config": BitsAndBytesConfig(
                        load_in_8bit=True,
                        llm_int8_enable_fp32_cpu_offload=True,
                    ),
                })

            if self.quantization == "4bit" or utils.koboldai_vars.colab_arg:
                tf_kwargs.update({
                    "quantization_config": BitsAndBytesConfig(
                        load_in_4bit=True,
                        bnb_4bit_compute_dtype=torch.float16,
                        bnb_4bit_use_double_quant=True,
                        bnb_4bit_quant_type='nf4',
                        llm_int8_enable_fp32_cpu_offload=True,
                    ),
                })

        if self.model_type == "gpt2":
            # We must disable low_cpu_mem_usage and quantization if using a GPT-2 model
            # because GPT-2 is not compatible with these features yet.
            tf_kwargs.pop("low_cpu_mem_usage", None)
            tf_kwargs.pop("quantization_config", None)

            # Also, lazy loader doesn't support GPT-2 models
            self.lazy_load = False

        if self.model_type == "llama":
            tf_kwargs.update({
                "pretraining_tp": 1,  # Workaround recommended by HF to fix their mistake on the config.json tuners adopted
            })

        logger.debug(
            "lazy_load: {} hascuda: {} breakmodel: {} nobreakmodel: {}".format(
                self.lazy_load,
                utils.koboldai_vars.hascuda,
                self.breakmodel,
                self.nobreakmodel,
            )
        )

        # If we're using torch_lazy_loader, we need to get breakmodel config
        # early so that it knows where to load the individual model tensors
        if (
            self.lazy_load
            and utils.koboldai_vars.hascuda
            and utils.koboldai_vars.breakmodel
            and not utils.koboldai_vars.nobreakmodel
        ):
            self.breakmodel_device_config(self.model_config)

        if self.lazy_load:
            # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
            tf_kwargs.pop("low_cpu_mem_usage", None)

            # If we're using lazy loader, we need to figure out what the model's hidden layers are called
            with lazy_loader.use_lazy_load(dematerialized_modules=True):
                try:
                    metamodel = AutoModelForCausalLM.from_config(self.model_config)
                    utils.layers_module_names = utils.get_layers_module_names(metamodel)
                    utils.module_names = list(metamodel.state_dict().keys())
                    utils.named_buffers = list(metamodel.named_buffers(recurse=True))
                except Exception as e:
                    if utils.args.panic:
                        raise e
                    logger.warning(f"Gave up on lazy loading due to {e}")
                    self.lazy_load = False

        # Download model from Huggingface if it does not exist, otherwise load locally
        if self.get_local_model_path():
            # Model is stored locally, load it.
            self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
            self.tokenizer = self._get_tokenizer(self.get_local_model_path())
        else:
            # Model not stored locally, we need to download it.

            # _rebuild_tensor patch for casting dtype and supporting LazyTensors
            old_rebuild_tensor = torch._utils._rebuild_tensor

            def new_rebuild_tensor(
                storage: Union[lazy_loader.LazyTensor, torch.Storage],
                storage_offset,
                shape,
                stride,
            ):
                if not isinstance(storage, lazy_loader.LazyTensor):
                    dtype = storage.dtype
                else:
                    dtype = storage.storage_type.dtype
                    if not isinstance(dtype, torch.dtype):
                        dtype = storage.storage_type(0).dtype
                if dtype is torch.float32 and len(shape) >= 2:
                    utils.koboldai_vars.fp32_model = True
                return old_rebuild_tensor(storage, storage_offset, shape, stride)

            torch._utils._rebuild_tensor = new_rebuild_tensor

            self.model = self._get_model(self.model_name, tf_kwargs)
            self.tokenizer = self._get_tokenizer(self.model_name)

            torch._utils._rebuild_tensor = old_rebuild_tensor
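
            # Persist a local copy so subsequent loads skip the download. If the
            # _rebuild_tensor patch above flagged the checkpoint as fp32, the model is
            # halved and re-saved with save_pretrained; otherwise the already-fp16 files
            # are moved straight out of the download cache.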
            if save_model:
                self.tokenizer.save_pretrained(
                    self.get_local_model_path(ignore_existance=True)
                )

                if utils.koboldai_vars.fp32_model:
                    # Use save_pretrained to convert fp32 models to fp16,
                    # unless we are using disk cache because save_pretrained
                    # is not supported in that case
                    self.model = self.model.half()
                    self.model.save_pretrained(
                        self.get_local_model_path(ignore_existance=True),
                        max_shard_size="500MiB",
                    )

                else:
                    # For fp16 models, we can just copy the model files directly
                    import transformers.configuration_utils
                    import transformers.modeling_utils
                    import transformers.file_utils
                    import huggingface_hub

                    # Save the config.json
                    shutil.move(
                        os.path.realpath(
                            huggingface_hub.hf_hub_download(
                                self.model_name,
                                transformers.configuration_utils.CONFIG_NAME,
                                revision=utils.koboldai_vars.revision,
                                cache_dir="cache",
                                local_files_only=True,
                                legacy_cache_layout=False,
                            )
                        ),
                        os.path.join(
                            self.get_local_model_path(ignore_existance=True),
                            transformers.configuration_utils.CONFIG_NAME,
                        ),
                    )

                    if utils.num_shards is None:
                        # Save the pytorch_model.bin or model.safetensors of an unsharded model
                        any_success = False
                        possible_checkpoint_names = [
                            transformers.modeling_utils.WEIGHTS_NAME,
                            "model.safetensors",
                        ]

                        for possible_checkpoint_name in possible_checkpoint_names:
                            try:
                                shutil.move(
                                    os.path.realpath(
                                        huggingface_hub.hf_hub_download(
                                            self.model_name,
                                            possible_checkpoint_name,
                                            revision=utils.koboldai_vars.revision,
                                            cache_dir="cache",
                                            local_files_only=True,
                                            legacy_cache_layout=False,
                                        )
                                    ),
                                    os.path.join(
                                        self.get_local_model_path(
                                            ignore_existance=True
                                        ),
                                        possible_checkpoint_name,
                                    ),
                                )
                                any_success = True
                            except Exception:
                                pass

                        if not any_success:
                            raise RuntimeError(
                                f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revision}'"
                            )
                    else:
                        # Handle saving sharded models

                        with open(utils.from_pretrained_index_filename) as f:
                            map_data = json.load(f)
                        filenames = set(map_data["weight_map"].values())

                        # Save the pytorch_model.bin.index.json of a sharded model
                        shutil.move(
                            os.path.realpath(utils.from_pretrained_index_filename),
                            os.path.join(
                                self.get_local_model_path(ignore_existance=True),
                                transformers.modeling_utils.WEIGHTS_INDEX_NAME,
                            ),
                        )

                        # Then save the pytorch_model-#####-of-#####.bin files
                        for filename in filenames:
                            shutil.move(
                                os.path.realpath(
                                    huggingface_hub.hf_hub_download(
                                        self.model_name,
                                        filename,
                                        revision=utils.koboldai_vars.revision,
                                        cache_dir="cache",
                                        local_files_only=True,
                                        legacy_cache_layout=False,
                                    )
                                ),
                                os.path.join(
                                    self.get_local_model_path(ignore_existance=True),
                                    filename,
                                ),
                            )

                    shutil.rmtree("cache/")

        self.patch_embedding()

        self.model.kai_model = self
        utils.koboldai_vars.modeldim = self.get_hidden_size()
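
    # Persists the backend settings (the layers / disk_layers split and the chosen
    # quantization mode) to the per-model settings file that get_requested_parameters
    # reads to restore defaults.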
    def _save_settings(self):
        with open(
            "settings/{}.generic_hf_torch.model_backend.settings".format(
                self.model_name.replace("/", "_")
            ),
            "w",
        ) as f:
            json.dump(
                {
                    "layers": self.layers if "layers" in vars(self) else [],
                    "disk_layers": self.disk_layers
                    if "disk_layers" in vars(self)
                    else 0,
                    "quantization": self.quantization,
                },
                f,
                indent="",
            )