From 4d34f9b7de03c6843e05cf5e11864d6b180a07b5 Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Sun, 16 Apr 2023 14:20:13 +0200
Subject: [PATCH] Move 4-bit loading code to separate inference_model file

---
 aiserver.py                                |  91 ++---
 modeling/inference_models/hf_torch_4bit.py | 385 +++++++++++++++++++++
 2 files changed, 412 insertions(+), 64 deletions(-)
 create mode 100644 modeling/inference_models/hf_torch_4bit.py

diff --git a/aiserver.py b/aiserver.py
index 7e9241f5..0a98d16f 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -1776,56 +1776,6 @@ def unload_model():
 
     #Reload our badwords
     koboldai_vars.badwordsids = koboldai_settings.badwordsids_default
-
-
-def prepare_4bit_load(modelpath):
-    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
-    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
-    result = False
-    groupsize = -1
-    for p in paths_4bit:
-        p = os.path.join(modelpath, p)
-        val = [v for v in glob.glob(p) if "4bit-old" not in v]
-        if val:
-            result = val[0]
-            fname = Path(result).parts[-1]
-            g = re.findall("^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
-            if g:
-                groupsize = int(g[0])
-            break
-
-    global monkey_patched_4bit
-
-    # Monkey-patch in old-format pt-file support
-    if not result:
-        print("4-bit file not found, falling back to old format.")
-        for p in paths_4bit_old:
-            p = os.path.join(modelpath, p)
-            if os.path.isfile(p):
-                result = p
-                break
-
-        if not result:
-            print("4-bit old-format file not found, loading failed.")
-            raise RuntimeError(f"4-bit load failed. PT-File not found.")
-
-        import llama, opt, gptneox, gptj, old_quant
-        llama.make_quant = old_quant.old_make_quant
-        opt.make_quant = old_quant.old_make_quant
-        gptneox.make_quant = old_quant.old_make_quant
-        gptj.make_quant = old_quant.old_make_quant
-        monkey_patched_4bit = True
-    elif monkey_patched_4bit:
-        # Undo monkey patch
-        print("Undoing 4-bit old format monkey patch")
-        import llama, opt, gptneox, gptj, quant
-        llama.make_quant = quant.make_quant
-        opt.make_quant = quant.make_quant
-        gptneox.make_quant = quant.make_quant
-        gptj.make_quant = quant.make_quant
-        monkey_patched_4bit = False
-
-    return result, groupsize
 
 
 def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=False, online_model="", use_breakmodel_args=False, breakmodel_args_default_to_cpu=False, url=None, use_8_bit=False, use_4_bit=False):
@@ -2008,9 +1958,9 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             except:
                 pass
 
-        try:
-            from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
-            model = GenericHFTorchInferenceModel(
+        if use_4_bit:
+            from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel
+            model = HFTorch4BitInferenceModel(
                 koboldai_vars.model,
                 lazy_load=koboldai_vars.lazy_load,
                 low_mem=args.lowmem
@@ -2020,18 +1970,31 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 save_model=not (args.colab or args.cacheonly) or args.savemodel,
                 initial_load=initial_load,
             )
-        except SuperLegacyModelError:
-            from modeling.inference_models.legacy_gpt2_hf import CustomGPT2HFTorchInferenceModel
-            model = CustomGPT2HFTorchInferenceModel(
-                koboldai_vars.model,
-                lazy_load=koboldai_vars.lazy_load,
-                low_mem=args.lowmem
-            )
+        else:
+            try:
+                from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
+                model = GenericHFTorchInferenceModel(
+                    koboldai_vars.model,
+                    lazy_load=koboldai_vars.lazy_load,
+                    low_mem=args.lowmem
+                )
 
-            model.load(
-                save_model=not (args.colab or args.cacheonly) or args.savemodel,
-                initial_load=initial_load,
-            )
+                model.load(
+                    save_model=not (args.colab or args.cacheonly) or args.savemodel,
+                    initial_load=initial_load,
+                )
+            except SuperLegacyModelError:
+                from modeling.inference_models.legacy_gpt2_hf import CustomGPT2HFTorchInferenceModel
+                model = CustomGPT2HFTorchInferenceModel(
+                    koboldai_vars.model,
+                    lazy_load=koboldai_vars.lazy_load,
+                    low_mem=args.lowmem
+                )
+
+                model.load(
+                    save_model=not (args.colab or args.cacheonly) or args.savemodel,
+                    initial_load=initial_load,
+                )
 
         logger.info(f"Pipeline created: {koboldai_vars.model}")
     else:
diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py
new file mode 100644
index 00000000..21f4ebfe
--- /dev/null
+++ b/modeling/inference_models/hf_torch_4bit.py
@@ -0,0 +1,385 @@
+from __future__ import annotations
+
+import os
+import json
+import torch
+import re
+import shutil
+import sys
+from typing import Dict, Union
+
+from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer
+from modeling.inference_model import SuperLegacyModelError
+
+import utils
+import modeling.lazy_loader as lazy_loader
+import koboldai_settings
+from logger import logger, set_logger_verbosity, quiesce_logger
+
+try:
+    import breakmodel
+except ModuleNotFoundError as e:
+    # Breakmodel is only expected to work on GPU
+    if not utils.koboldai_vars.use_colab_tpu:
+        raise e
+
+from modeling.inference_models.hf_torch import HFTorchInferenceModel
+
+# 4-bit dependencies
+from pathlib import Path
+import glob
+sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
+from gptj import load_quant as gptj_load_quant
+from gptneox import load_quant as gptneox_load_quant
+from llama import load_quant as llama_load_quant
+from opt import load_quant as opt_load_quant
+from offload import load_quant_offload
+monkey_patched_4bit = False
+
+
+def prepare_4bit_load(modelpath):
+    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
+    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
+    result = False
+    groupsize = -1
+    for p in paths_4bit:
+        p = os.path.join(modelpath, p)
+        val = [v for v in glob.glob(p) if "4bit-old" not in v]
+        if val:
+            result = val[0]
+            fname = Path(result).parts[-1]
+            g = re.findall(r"^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
+            if g:
+                groupsize = int(g[0])
+            break
+
+    global monkey_patched_4bit
+
+    # Monkey-patch in old-format pt-file support
+    if not result:
+        print("4-bit file not found, falling back to old format.")
+        for p in paths_4bit_old:
+            p = os.path.join(modelpath, p)
+            if os.path.isfile(p):
+                result = p
+                break
+
+        if not result:
+            print("4-bit old-format file not found, loading failed.")
+            raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.")
+
+        import llama, opt, gptneox, gptj, old_quant
+        llama.make_quant = old_quant.old_make_quant
+        opt.make_quant = old_quant.old_make_quant
+        gptneox.make_quant = old_quant.old_make_quant
+        gptj.make_quant = old_quant.old_make_quant
+        monkey_patched_4bit = True
+    elif monkey_patched_4bit:
+        # Undo monkey patch
+        print("Undoing 4-bit old format monkey patch")
+        import llama, opt, gptneox, gptj, quant
+        llama.make_quant = quant.make_quant
+        opt.make_quant = quant.make_quant
+        gptneox.make_quant = quant.make_quant
+        gptj.make_quant = quant.make_quant
+        monkey_patched_4bit = False
+
+    return result, groupsize
+
+
+class HFTorch4BitInferenceModel(HFTorchInferenceModel):
+    def _load(self, save_model: bool, initial_load: bool) -> None:
+        utils.koboldai_vars.allowsp = True
+
+        # Make model path the same as the model name to make this consistent
+        # with the other loading method if it isn't a known model type. This
+        # code is not just a workaround for below, it is also used to make the
+        # behavior consistent with other loading methods - Henk717
+        # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]:
+        #     utils.koboldai_vars.custmodpth = utils.koboldai_vars.model
+
+        if self.model_name == "NeoCustom":
+            self.model_name = os.path.basename(
+                os.path.normpath(utils.koboldai_vars.custmodpth)
+            )
+            utils.koboldai_vars.model = self.model_name
+
+        self.lazy_load = False
+
+        self.init_model_config()
+
+        gpulayers = utils.args.breakmodel_gpulayers
+
+        try:
+            self.gpu_layers_list = [int(l) for l in gpulayers.split(",")]
+        except ValueError:
+            self.gpu_layers_list = [utils.num_layers(self.model_config)]
+        self.offload_4bit = sum(self.gpu_layers_list) < utils.num_layers(self.model_config)
+
+        if self.offload_4bit:
+            utils.koboldai_vars.lazy_load = False
+            print("4-bit CPU offloader active")
+
+        tf_kwargs = {
+            "low_cpu_mem_usage": True,
+        }
+
+        # If we're using torch_lazy_loader, we need to get breakmodel config
+        # early so that it knows where to load the individual model tensors
+        if (
+            self.lazy_load
+            and utils.koboldai_vars.hascuda
+            and utils.koboldai_vars.breakmodel
+            and not utils.koboldai_vars.nobreakmodel
+        ):
+            self.breakmodel_device_config(self.model_config)
+
+        if self.lazy_load:
+            # If we're using lazy loader, we need to figure out what the model's hidden layers are called
+            with lazy_loader.use_lazy_load(
+                dematerialized_modules=True, use_accelerate_init_empty_weights=True
+            ):
+                try:
+                    metamodel = AutoModelForCausalLM.from_config(self.model_config)
+                except Exception as e:
+                    logger.error(f"Fell back to neo for metamodel due to {e}")
+                    try:
+                        metamodel = GPTNeoForCausalLM.from_config(self.model_config)
+                    except Exception as e:
+                        logger.error(f"Falling back again due to {e}")
+                        raise SuperLegacyModelError
+
+                utils.layers_module_names = utils.get_layers_module_names(metamodel)
+                utils.module_names = list(metamodel.state_dict().keys())
+                utils.named_buffers = list(metamodel.named_buffers(recurse=True))
+
+        # Download model from Huggingface if it does not exist, otherwise load locally
+        with self._maybe_use_float16(), lazy_loader.use_lazy_load(
+            enable=self.lazy_load,
+            callback=self._get_lazy_load_callback(utils.num_layers(self.model_config))
+            if self.lazy_load
+            else None,
+            dematerialized_modules=True,
+        ):
+            if self.lazy_load:
+                # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
+                tf_kwargs.pop("low_cpu_mem_usage", None)
+
+            if self.get_local_model_path():
+                # Model is stored locally, load it.
+                self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
+                self.tokenizer = self._get_tokenizer(self.get_local_model_path())
+            else:
+                # Model not stored locally, we need to download it.
+
+                # _rebuild_tensor patch for casting dtype and supporting LazyTensors
+                old_rebuild_tensor = torch._utils._rebuild_tensor
+
+                def new_rebuild_tensor(
+                    storage: Union[lazy_loader.LazyTensor, torch.Storage],
+                    storage_offset,
+                    shape,
+                    stride,
+                ):
+                    if not isinstance(storage, lazy_loader.LazyTensor):
+                        dtype = storage.dtype
+                    else:
+                        dtype = storage.storage_type.dtype
+                        if not isinstance(dtype, torch.dtype):
+                            dtype = storage.storage_type(0).dtype
+                    if dtype is torch.float32 and len(shape) >= 2:
+                        utils.koboldai_vars.fp32_model = True
+                    return old_rebuild_tensor(storage, storage_offset, shape, stride)
+
+                torch._utils._rebuild_tensor = new_rebuild_tensor
+                self.model = self._get_model(self.model_name, tf_kwargs)
+                self.tokenizer = self._get_tokenizer(self.model_name)
+                torch._utils._rebuild_tensor = old_rebuild_tensor
+
+                if save_model:
+                    self.tokenizer.save_pretrained(
+                        self.get_local_model_path(ignore_existance=True)
+                    )
+
+                    if utils.koboldai_vars.fp32_model and not breakmodel.disk_blocks:
+                        # Use save_pretrained to convert fp32 models to fp16,
+                        # unless we are using disk cache because save_pretrained
+                        # is not supported in that case
+                        self.model = self.model.half()
+                        self.model.save_pretrained(
+                            self.get_local_model_path(ignore_existance=True),
+                            max_shard_size="500MiB",
+                        )
+
+                    else:
+                        # For fp16 models, we can just copy the model files directly
+                        import transformers.configuration_utils
+                        import transformers.modeling_utils
+                        import transformers.file_utils
+                        import huggingface_hub
+
+                        # Save the config.json
+                        shutil.move(
+                            os.path.realpath(
+                                huggingface_hub.hf_hub_download(
+                                    self.model_name,
+                                    transformers.configuration_utils.CONFIG_NAME,
+                                    revision=utils.koboldai_vars.revision,
+                                    cache_dir="cache",
+                                    local_files_only=True,
+                                    legacy_cache_layout=False,
+                                )
+                            ),
+                            os.path.join(
+                                self.get_local_model_path(ignore_existance=True),
+                                transformers.configuration_utils.CONFIG_NAME,
+                            ),
+                        )
+
+                        if utils.num_shards is None:
+                            # Save the pytorch_model.bin or model.safetensors of an unsharded model
+                            any_success = False
+                            possible_checkpoint_names = [
+                                transformers.modeling_utils.WEIGHTS_NAME,
+                                "model.safetensors",
+                            ]
+
+                            for possible_checkpoint_name in possible_checkpoint_names:
+                                try:
+                                    shutil.move(
+                                        os.path.realpath(
+                                            huggingface_hub.hf_hub_download(
+                                                self.model_name,
+                                                possible_checkpoint_name,
+                                                revision=utils.koboldai_vars.revision,
+                                                cache_dir="cache",
+                                                local_files_only=True,
+                                                legacy_cache_layout=False,
+                                            )
+                                        ),
+                                        os.path.join(
+                                            self.get_local_model_path(
+                                                ignore_existance=True
+                                            ),
+                                            possible_checkpoint_name,
+                                        ),
+                                    )
+                                    any_success = True
+                                except Exception:
+                                    pass
+
+                            if not any_success:
+                                raise RuntimeError(f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revision}'")
+                        else:
+                            # Handle saving sharded models
+
+                            with open(utils.from_pretrained_index_filename) as f:
+                                map_data = json.load(f)
+                            filenames = set(map_data["weight_map"].values())
+                            # Save the pytorch_model.bin.index.json of a sharded model
+                            shutil.move(
+                                os.path.realpath(utils.from_pretrained_index_filename),
+                                os.path.join(
+                                    self.get_local_model_path(ignore_existance=True),
+                                    transformers.modeling_utils.WEIGHTS_INDEX_NAME,
+                                ),
+                            )
+                            # Then save the pytorch_model-#####-of-#####.bin files
+                            for filename in filenames:
+                                shutil.move(
+                                    os.path.realpath(
+                                        huggingface_hub.hf_hub_download(
+                                            self.model_name,
+                                            filename,
+                                            revision=utils.koboldai_vars.revision,
+                                            cache_dir="cache",
+                                            local_files_only=True,
+                                            legacy_cache_layout=False,
+                                        )
+                                    ),
+                                    os.path.join(
+                                        self.get_local_model_path(
+                                            ignore_existance=True
+                                        ),
+                                        filename,
+                                    ),
+                                )
+                        shutil.rmtree("cache/")
+
+        if not self.lazy_load:
+            utils.layers_module_names = utils.get_layers_module_names(self.model)
+            utils.module_names = list(self.model.state_dict().keys())
+            utils.named_buffers = list(self.model.named_buffers(recurse=True))
+
+        if (
+            utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default
+            and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj")
+        ):
+            utils.koboldai_vars.badwordsids = [
+                [v]
+                for k, v in self.tokenizer.get_vocab().items()
+                if any(c in str(k) for c in "[]")
+            ]
+
+        self.patch_embedding()
+
+        if utils.koboldai_vars.hascuda:
+            if utils.koboldai_vars.usegpu:
+                # Use just VRAM
+                self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
+            elif utils.koboldai_vars.breakmodel:
+                # Use both RAM and VRAM (breakmodel)
+                if not self.lazy_load:
+                    self.breakmodel_device_config(self.model.config)
+                self._move_to_devices()
+            elif breakmodel.disk_blocks > 0:
+                # Use disk
+                self._move_to_devices()
+            else:
+                # Use CPU
+                self.model = self.model.to("cpu").float()
+        elif breakmodel.disk_blocks > 0:
+            self._move_to_devices()
+        else:
+            self.model = self.model.to("cpu").float()
+
+        self.model.kai_model = self
+        utils.koboldai_vars.modeldim = self.get_hidden_size()
+
+    def _get_model(self, location: str, tf_kwargs: Dict):
+        path_4bit, groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth)
+        print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}")
+
+        print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit")
+        if utils.koboldai_vars.model_type == "gptj":
+            if self.offload_4bit:
+                model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
+            else:
+                model = gptj_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+        elif utils.koboldai_vars.model_type == "gpt_neox":
+            if self.offload_4bit:
+                model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
+            else:
+                model = gptneox_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+        elif utils.koboldai_vars.model_type == "llama":
+            if self.offload_4bit:
+                model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
+            else:
+                model = llama_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+        elif utils.koboldai_vars.model_type == "opt":
+            if self.offload_4bit:
+                model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
+            else:
+                model = opt_load_quant(utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize)
+        else:
+            raise RuntimeError(f"4-bit load failed. Model type {utils.koboldai_vars.model_type} not supported in 4-bit")
+
+        return model.half()
+
+    def _get_tokenizer(self, location: str):
+        if utils.koboldai_vars.model_type == "llama":
+            tokenizer = LlamaTokenizer.from_pretrained(utils.koboldai_vars.custmodpth)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(utils.koboldai_vars.custmodpth)
+
+        return tokenizer
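
Usage note (not part of the patch): the aiserver.py hunk above is the only call site this change introduces. Below is a minimal sketch of that path for reference, assuming the usual KoboldAI globals (koboldai_vars, args, initial_load) are already set up by aiserver.py and that koboldai_vars.custmodpth points at a model folder containing a quantized checkpoint matching 4bit*.safetensors or 4bit*.pt (for example a hypothetical 4bit-128g.safetensors, from which prepare_4bit_load would parse groupsize 128).

    # Sketch only: mirrors the new use_4_bit branch in load_model(); the surrounding
    # KoboldAI state (koboldai_vars, args, initial_load) is assumed to exist.
    if use_4_bit:
        from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel

        model = HFTorch4BitInferenceModel(
            koboldai_vars.model,                # model name; custmodpth is the folder searched for 4bit*.safetensors / 4bit*.pt
            lazy_load=koboldai_vars.lazy_load,
            low_mem=args.lowmem,
        )
        model.load(
            save_model=not (args.colab or args.cacheonly) or args.savemodel,
            initial_load=initial_load,
        )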