From 1df03d9a27b86a086abafe80aefc0db67aa8e3f0 Mon Sep 17 00:00:00 2001
From: somebody
Date: Sun, 23 Jul 2023 20:54:04 -0500
Subject: [PATCH] Basic

---
 .../inference_models/gptq_hf_torch/class.py | 94 +++++++++++--------
 modeling/lazy_loader.py                     |  5 +
 modeling/patches.py                         | 19 ++++
 3 files changed, 79 insertions(+), 39 deletions(-)

diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py
index 81a33c70..9a1b872e 100644
--- a/modeling/inference_models/gptq_hf_torch/class.py
+++ b/modeling/inference_models/gptq_hf_torch/class.py
@@ -89,6 +89,12 @@ class model_backend(HFTorchInferenceModel):
         return bool(gptq_model)
 
     def _load(self, save_model: bool, initial_load: bool) -> None:
+        try:
+            import hf_bleeding_edge
+            from hf_bleeding_edge import AutoModelForCausalLM
+        except ImportError:
+            from transformers import AutoModelForCausalLM
+
         # Make model path the same as the model name to make this consistent
         # with the other loading method if it isn't a known model type. This
         # code is not just a workaround for below, it is also used to make the
@@ -98,7 +104,7 @@ class model_backend(HFTorchInferenceModel):
 
         self.init_model_config()
 
-        self.lazy_load = False
+        self.lazy_load = True
 
         gpulayers = self.breakmodel_config.gpu_blocks
 
@@ -181,50 +187,60 @@ class model_backend(HFTorchInferenceModel):
         model_type = self.get_model_type()
 
         logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}")
-        if model_type == "gptj":
-            model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
-        elif model_type == "gpt_neox":
-            model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
-        elif model_type == "llama":
-            model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
-        elif model_type == "opt":
-            model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
-        elif model_type == "mpt":
-            model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
-        elif model_type == "gpt_bigcode":
-            model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half()
-        else:
-            try:
-                import auto_gptq
-                from auto_gptq import AutoGPTQForCausalLM
-            except ImportError:
-                raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")
-            try:
-                import hf_bleeding_edge
-                from hf_bleeding_edge import AutoModelForCausalLM
-            except ImportError:
-                from transformers import AutoModelForCausalLM
-            # Monkey patch in hf_bleeding_edge to avoid having to trust remote code
-            auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
-            auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
-            auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM
-            model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"))
+        with lazy_loader.use_lazy_load(
+            enable=self.lazy_load,
+            dematerialized_modules=False,
+        ):
+            print(self.lazy_load)
+            if model_type == "gptj":
+                model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+            elif model_type == "gpt_neox":
+                model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+            elif model_type == "llama":
+                print("LLLLLAAAMMMAA")
+                print(torch.load)
+                model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+            elif model_type == "opt":
+                model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+            elif model_type == "mpt":
+                model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+            elif model_type == "gpt_bigcode":
+                model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half()
+            else:
+                try:
+                    import auto_gptq
+                    from auto_gptq import AutoGPTQForCausalLM
+                except ImportError:
+                    raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")
 
-        # Patch in embeddings function
-        def get_input_embeddings(self):
-            return self.model.get_input_embeddings()
+                try:
+                    import hf_bleeding_edge
+                    from hf_bleeding_edge import AutoModelForCausalLM
+                except ImportError:
+                    from transformers import AutoModelForCausalLM
 
-        type(model).get_input_embeddings = get_input_embeddings
+                # Monkey patch in hf_bleeding_edge to avoid having to trust remote code
+                auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
+                auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
+                auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM
 
-        # Patch in args support..
-        def generate(self, *args, **kwargs):
-            """shortcut for model.generate"""
-            with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
-                return self.model.generate(*args, **kwargs)
+                model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors"))
 
-        type(model).generate = generate
+            # Patch in embeddings function
+            def get_input_embeddings(self):
+                return self.model.get_input_embeddings()
+
+            type(model).get_input_embeddings = get_input_embeddings
+
+            # Patch in args support..
+            def generate(self, *args, **kwargs):
+                """shortcut for model.generate"""
+                with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
+                    return self.model.generate(*args, **kwargs)
+
+            type(model).generate = generate
 
         return model
 
diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py
index 69e0d948..8fff59d3 100644
--- a/modeling/lazy_loader.py
+++ b/modeling/lazy_loader.py
@@ -358,16 +358,19 @@ def safetensors_load_tensor_independently(
 ) -> torch.Tensor:
     """A hacky way to load a tensor by itself and not mmap every single
     tensor or whatever is causing that big memory spike"""
+    print("[ld]", tensor_key)
     with safetensors.safe_open(checkpoint_file, framework="pt", device=device) as f:
         return f.get_tensor(tensor_key)
 
 
 def patch_safetensors(callback):
+    print("Hi! We are patching safetensors")
     # Safetensors load patch
     import transformers
 
     def safetensors_load(checkpoint_file: str) -> dict:
+        print("LOAD NOW", safetensors_load)
         # Monkeypatch applied to safetensors.torch.load_file
 
         if utils.koboldai_vars.hascuda:
@@ -409,6 +412,7 @@ def patch_safetensors(callback):
         return tensors
 
     transformers.modeling_utils.safe_load_file = safetensors_load
+    safetensors.torch.load_file = safetensors_load
 
 
 @contextlib.contextmanager
@@ -520,6 +524,7 @@ def use_lazy_load(
     old_torch_load = torch.load
 
     def torch_load(f, map_location=None, pickle_module=pickle, **pickle_load_args):
+        print("TORCHLOAD", f)
         model_dict = old_torch_load(
             f=f,
             map_location=map_location,
diff --git a/modeling/patches.py b/modeling/patches.py
index 6e2168f2..f5b6bd06 100644
--- a/modeling/patches.py
+++ b/modeling/patches.py
@@ -129,15 +129,34 @@ def patch_transformers_generation() -> None:
 
 
 class LazyloadPatches:
+    class StateDictFacade(dict):
+        def __init__(self, state_dict):
+            self.update(state_dict)
+
+        def __getitem__(self, name):
+            return super().__getitem__(name).materialize(map_location="cuda:0")
+
     old_load_state_dict = transformers.modeling_utils._load_state_dict_into_meta_model
+    torch_old_load_from_state_dict = torch.nn.Module._load_from_state_dict
 
     def __enter__() -> None:
         transformers.modeling_utils._load_state_dict_into_meta_model = (
             LazyloadPatches._load_state_dict_into_meta_model
         )
+        torch.nn.Module._load_from_state_dict = LazyloadPatches._torch_load_from_state_dict
+        # torch.nn.Module._load_from_state_dict = _agn
 
     def __exit__(exc_type, exc_value, exc_traceback) -> None:
         transformers.modeling_utils._load_state_dict_into_meta_model = LazyloadPatches.old_load_state_dict
+        torch.nn.Module._load_from_state_dict = LazyloadPatches.torch_old_load_from_state_dict
+
+    def _torch_load_from_state_dict(self, state_dict, *args, **kwargs):
+        return LazyloadPatches.torch_old_load_from_state_dict(
+            self,
+            LazyloadPatches.StateDictFacade(state_dict),
+            *args,
+            **kwargs
+        )
 
     def _load_state_dict_into_meta_model(
         model,
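
For reference, the technique the modeling/patches.py hunk relies on can be shown in isolation: torch.nn.Module._load_from_state_dict is swapped for a wrapper that materializes tensors only at the moment the load reads them. The sketch below is illustrative rather than part of the patch; LazyTensor is a hypothetical stand-in for the lazy tensors produced by modeling/lazy_loader.py, and it materializes to CPU instead of "cuda:0" so it runs without a GPU.

# Illustrative sketch (not part of the patch): LazyTensor is a hypothetical
# stand-in for the lazy tensors that modeling/lazy_loader.py produces.
import torch


class LazyTensor:
    """Placeholder for a tensor that is only read from storage on request."""

    def __init__(self, shape):
        self.shape = shape

    def materialize(self, map_location=None):
        # A real lazy loader would deserialize the tensor from the checkpoint here.
        return torch.zeros(self.shape, device=map_location or "cpu")


class StateDictFacade(dict):
    """Dict wrapper whose __getitem__ hands back materialized tensors."""

    def __getitem__(self, name):
        value = super().__getitem__(name)
        return value.materialize() if isinstance(value, LazyTensor) else value


_original_load = torch.nn.Module._load_from_state_dict


def _patched_load(self, state_dict, *args, **kwargs):
    # Same shape as LazyloadPatches._torch_load_from_state_dict: only the
    # state_dict argument is replaced before delegating to the original.
    return _original_load(self, StateDictFacade(state_dict), *args, **kwargs)


torch.nn.Module._load_from_state_dict = _patched_load
try:
    layer = torch.nn.Linear(4, 4)
    lazy_state = {"weight": LazyTensor((4, 4)), "bias": LazyTensor((4,))}
    # load_state_dict only touches concrete tensors inside _load_from_state_dict,
    # which is where the facade materializes them one by one.
    layer.load_state_dict(lazy_state)
finally:
    torch.nn.Module._load_from_state_dict = _original_load

Because only __getitem__ is wrapped, keys(), items(), and membership checks still see the lazy objects, so nothing is materialized until the copy into each parameter actually happens.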