From 1df03d9a27b86a086abafe80aefc0db67aa8e3f0 Mon Sep 17 00:00:00 2001 From: somebody Date: Sun, 23 Jul 2023 20:54:04 -0500 Subject: [PATCH 1/9] Basic --- .../inference_models/gptq_hf_torch/class.py | 94 +++++++++++-------- modeling/lazy_loader.py | 5 + modeling/patches.py | 19 ++++ 3 files changed, 79 insertions(+), 39 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 81a33c70..9a1b872e 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -89,6 +89,12 @@ class model_backend(HFTorchInferenceModel): return bool(gptq_model) def _load(self, save_model: bool, initial_load: bool) -> None: + try: + import hf_bleeding_edge + from hf_bleeding_edge import AutoModelForCausalLM + except ImportError: + from transformers import AutoModelForCausalLM + # Make model path the same as the model name to make this consistent # with the other loading method if it isn't a known model type. This # code is not just a workaround for below, it is also used to make the @@ -98,7 +104,7 @@ class model_backend(HFTorchInferenceModel): self.init_model_config() - self.lazy_load = False + self.lazy_load = True gpulayers = self.breakmodel_config.gpu_blocks @@ -181,50 +187,60 @@ class model_backend(HFTorchInferenceModel): model_type = self.get_model_type() logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}") - if model_type == "gptj": - model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "gpt_neox": - model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "llama": - model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "opt": - model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "mpt": - model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) - elif model_type == "gpt_bigcode": - model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half() - else: - try: - import auto_gptq - from auto_gptq import AutoGPTQForCausalLM - except ImportError: - raise RuntimeError(f"4-bit load failed. 
Model type {model_type} not supported in 4-bit") - try: - import hf_bleeding_edge - from hf_bleeding_edge import AutoModelForCausalLM - except ImportError: - from transformers import AutoModelForCausalLM - # Monkey patch in hf_bleeding_edge to avoid having to trust remote code - auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig - auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig - auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM - model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors")) + with lazy_loader.use_lazy_load( + enable=self.lazy_load, + dematerialized_modules=False, + ): + print(self.lazy_load) + if model_type == "gptj": + model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + elif model_type == "gpt_neox": + model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + elif model_type == "llama": + print("LLLLLAAAMMMAA") + print(torch.load) + model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + elif model_type == "opt": + model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + elif model_type == "mpt": + model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + elif model_type == "gpt_bigcode": + model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half() + else: + try: + import auto_gptq + from auto_gptq import AutoGPTQForCausalLM + except ImportError: + raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit") - # Patch in embeddings function - def get_input_embeddings(self): - return self.model.get_input_embeddings() + try: + import hf_bleeding_edge + from hf_bleeding_edge import AutoModelForCausalLM + except ImportError: + from transformers import AutoModelForCausalLM - type(model).get_input_embeddings = get_input_embeddings + # Monkey patch in hf_bleeding_edge to avoid having to trust remote code + auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig + auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig + auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM - # Patch in args support.. - def generate(self, *args, **kwargs): - """shortcut for model.generate""" - with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type): - return self.model.generate(*args, **kwargs) + model = AutoGPTQForCausalLM.from_quantized(location, model_basename=Path(gptq_file).stem, use_safetensors=gptq_file.endswith(".safetensors")) - type(model).generate = generate + # Patch in embeddings function + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + type(model).get_input_embeddings = get_input_embeddings + + # Patch in args support.. 
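+            # (Descriptive note: the wrapper below forwards *args/**kwargs to the wrapped
+            # model.generate under torch.inference_mode and autocast on the model's device.)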
+ def generate(self, *args, **kwargs): + """shortcut for model.generate""" + with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type): + return self.model.generate(*args, **kwargs) + + type(model).generate = generate return model diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index 69e0d948..8fff59d3 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -358,16 +358,19 @@ def safetensors_load_tensor_independently( ) -> torch.Tensor: """A hacky way to load a tensor by itself and not mmap every single tensor or whatever is causing that big memory spike""" + print("[ld]", tensor_key) with safetensors.safe_open(checkpoint_file, framework="pt", device=device) as f: return f.get_tensor(tensor_key) def patch_safetensors(callback): + print("Hi! We are patching safetensors") # Safetensors load patch import transformers def safetensors_load(checkpoint_file: str) -> dict: + print("LOAD NOW", safetensors_load) # Monkeypatch applied to safetensors.torch.load_file if utils.koboldai_vars.hascuda: @@ -409,6 +412,7 @@ def patch_safetensors(callback): return tensors transformers.modeling_utils.safe_load_file = safetensors_load + safetensors.torch.load_file = safetensors_load @contextlib.contextmanager @@ -520,6 +524,7 @@ def use_lazy_load( old_torch_load = torch.load def torch_load(f, map_location=None, pickle_module=pickle, **pickle_load_args): + print("TORCHLOAD", f) model_dict = old_torch_load( f=f, map_location=map_location, diff --git a/modeling/patches.py b/modeling/patches.py index 6e2168f2..f5b6bd06 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -129,15 +129,34 @@ def patch_transformers_generation() -> None: class LazyloadPatches: + class StateDictFacade(dict): + def __init__(self, state_dict): + self.update(state_dict) + + def __getitem__(self, name): + return super().__getitem__(name).materialize(map_location="cuda:0") + old_load_state_dict = transformers.modeling_utils._load_state_dict_into_meta_model + torch_old_load_from_state_dict = torch.nn.Module._load_from_state_dict def __enter__() -> None: transformers.modeling_utils._load_state_dict_into_meta_model = ( LazyloadPatches._load_state_dict_into_meta_model ) + torch.nn.Module._load_from_state_dict = LazyloadPatches._torch_load_from_state_dict + # torch.nn.Module._load_from_state_dict = _agn def __exit__(exc_type, exc_value, exc_traceback) -> None: transformers.modeling_utils._load_state_dict_into_meta_model = LazyloadPatches.old_load_state_dict + torch.nn.Module._load_from_state_dict = LazyloadPatches.torch_old_load_from_state_dict + + def _torch_load_from_state_dict(self, state_dict, *args, **kwargs): + return LazyloadPatches.torch_old_load_from_state_dict( + self, + LazyloadPatches.StateDictFacade(state_dict), + *args, + **kwargs + ) def _load_state_dict_into_meta_model( model, From a6aafb252534b26bbdf034788895c3317b4cdd53 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 13:07:30 -0500 Subject: [PATCH 2/9] GPTQ: Patch QuantLinear to not use CPU RAM --- .../inference_models/gptq_hf_torch/class.py | 23 ++++++++++++++++++- modeling/lazy_loader.py | 3 +-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 9a1b872e..d942a539 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -7,7 +7,7 @@ import torch import re import shutil import sys -from typing import Union +from typing import 
Dict, Union import utils import modeling.lazy_loader as lazy_loader @@ -167,6 +167,25 @@ class model_backend(HFTorchInferenceModel): self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() + def _patch_quant(self) -> None: + # QuantLinear loads on the CPU by default, using a lot of RAM! If we + # load it to the same device that the weights are gonna be on, it + # mysteriously uses no additional VRAM + + from gptq import quant_v3 + from gptq import quant_v2 + from gptq import quant_v1 + + def _ql_init_(self, *args, **kwargs): + ret = type(self)._unpatched_init(self, *args, **kwargs) + self.to("cuda:0") + return ret + + for quant_module in [quant_v3, quant_v2, quant_v1]: + quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__ + quant_module.QuantLinear.__init__ = _ql_init_ + + def _get_model(self, location: str, tf_kwargs: Dict): import gptq from gptq.gptj import load_quant as gptj_load_quant @@ -177,6 +196,8 @@ class model_backend(HFTorchInferenceModel): from gptq.mpt import load_quant as mpt_load_quant from gptq.offload import load_quant_offload + self._patch_quant() + gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location) v2_bias = False diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index 8fff59d3..a5e7c58f 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -358,7 +358,6 @@ def safetensors_load_tensor_independently( ) -> torch.Tensor: """A hacky way to load a tensor by itself and not mmap every single tensor or whatever is causing that big memory spike""" - print("[ld]", tensor_key) with safetensors.safe_open(checkpoint_file, framework="pt", device=device) as f: return f.get_tensor(tensor_key) @@ -379,7 +378,7 @@ def patch_safetensors(callback): # (70 tensors/s -> 65 tensor/s). The memory savings probably # shouldn't be the happening, maybe there's a memory leak # somewhere in our pipeline with CPU tensors. 
- intermediary_device = "cuda" + intermediary_device = "cuda:0" else: intermediary_device = "cpu" From 4a6cccb00227561454e395b796aada44a60b05cf Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 13:09:15 -0500 Subject: [PATCH 3/9] Import fix --- modeling/inference_models/gptq_hf_torch/class.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index d942a539..499b2682 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -90,7 +90,6 @@ class model_backend(HFTorchInferenceModel): def _load(self, save_model: bool, initial_load: bool) -> None: try: - import hf_bleeding_edge from hf_bleeding_edge import AutoModelForCausalLM except ImportError: from transformers import AutoModelForCausalLM From 929917efe9bb51aa4fe2147f6813205908efb3f6 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 13:09:43 -0500 Subject: [PATCH 4/9] Remove shrieking --- modeling/inference_models/gptq_hf_torch/class.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 499b2682..74f11e18 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -208,19 +208,15 @@ class model_backend(HFTorchInferenceModel): logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}") - with lazy_loader.use_lazy_load( enable=self.lazy_load, dematerialized_modules=False, ): - print(self.lazy_load) if model_type == "gptj": model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "gpt_neox": model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "llama": - print("LLLLLAAAMMMAA") - print(torch.load) model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) elif model_type == "opt": model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) From 43a4abaf6320cc86e244cf103cc93b520339550e Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 13:10:33 -0500 Subject: [PATCH 5/9] Remove even more debug --- modeling/lazy_loader.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index a5e7c58f..74770a1c 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -176,9 +176,6 @@ class TorchLazyTensor(LazyTensor): CheckpointChunkCache.key = self.key ziproot = checkpoint.namelist()[0].split("/")[0] CheckpointChunkCache.handle = checkpoint.open(f"{ziproot}/data/{self.key}", "r") - - - else: # Cache hit. Hip hip hooray! :^) # print(".", end="", flush=True) @@ -318,7 +315,6 @@ class _LazyUnpickler(RestrictedUnpickler): lazy_loaded_storages: Dict[str, LazyTensor] def __init__(self, *args, **kwargs): - # print(args, kwargs) self.lazy_loaded_storages = {} return super().__init__(*args, **kwargs) @@ -364,12 +360,10 @@ def safetensors_load_tensor_independently( def patch_safetensors(callback): - print("Hi! 
We are patching safetensors") # Safetensors load patch import transformers def safetensors_load(checkpoint_file: str) -> dict: - print("LOAD NOW", safetensors_load) # Monkeypatch applied to safetensors.torch.load_file if utils.koboldai_vars.hascuda: @@ -523,7 +517,6 @@ def use_lazy_load( old_torch_load = torch.load def torch_load(f, map_location=None, pickle_module=pickle, **pickle_load_args): - print("TORCHLOAD", f) model_dict = old_torch_load( f=f, map_location=map_location, From 34aa333c44a16c38ce586efc0fb2118da0c20b0e Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 13:11:06 -0500 Subject: [PATCH 6/9] Last debug --- modeling/patches.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modeling/patches.py b/modeling/patches.py index f5b6bd06..5664ec07 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -144,7 +144,6 @@ class LazyloadPatches: LazyloadPatches._load_state_dict_into_meta_model ) torch.nn.Module._load_from_state_dict = LazyloadPatches._torch_load_from_state_dict - # torch.nn.Module._load_from_state_dict = _agn def __exit__(exc_type, exc_value, exc_traceback) -> None: transformers.modeling_utils._load_state_dict_into_meta_model = LazyloadPatches.old_load_state_dict From a73420c49c1371c49b59816d3122d6e6d4f3b676 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 17:15:59 -0500 Subject: [PATCH 7/9] really really really sketchy breakmodel implementation im gonna go lie down for an extended period of time --- .../inference_models/gptq_hf_torch/class.py | 175 +++++++++++++++--- 1 file changed, 153 insertions(+), 22 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 74f11e18..45d18f7b 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -82,6 +82,79 @@ def get_gptq_version(fpath): logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}") return 0, False +def load_quant_offload_device_map( + load_quant_func, model, checkpoint, wbits, groupsize, device_map, offload_type=0, force_bias=False, +): + from gptq.offload import ( + find_layers, + llama_offload_forward, + gptneox_offload_forward, + gptj_offload_forward, + opt_offload_forward, + bigcode_offload_forward + ) + from transformers.models.llama.modeling_llama import LlamaModel + from transformers.models.opt.modeling_opt import OPTModel + from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXModel + from transformers.models.gptj.modeling_gptj import GPTJModel + from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeModel + model = load_quant_func(model, checkpoint, wbits, groupsize, force_bias=force_bias) + + print(device_map) + + m, layers, remaining = find_layers(model) + + type(m).non_offload_forward = type(m).forward + + # Hook offload_forward into found model + if type(m) == LlamaModel: + type(m).forward = llama_offload_forward + elif type(m) == GPTNeoXModel: + type(m).forward = gptneox_offload_forward + elif type(m) == GPTJModel: + type(m).forward = gptj_offload_forward + elif type(m) == OPTModel: + type(m).forward = opt_offload_forward + elif type(m) == GPTBigCodeModel: + type(m).forward = bigcode_offload_forward + else: + raise RuntimeError(f"Model type {type(m)} not supported by CPU offloader") + + layers_done = len([1 for v in device_map.values() if v != "cpu"]) + print("LDone", layers_done) + + m.cpu_device = torch.device("cpu") + m.fast_offload = layers_done > len(layers) // 2 + m.layer_count = 
len(layers) + m.cpu_layers = len(layers) - layers_done + m.gpu_layers = layers_done + m.offload_type = offload_type + # HACK + m.primary_gpu = list(device_map.values())[0] + + if "layers" not in dir(m): + m.layers = layers + + print(len(layers)) + print(len(device_map)) + + print(m.primary_gpu) + for i in range(len(layers)): + dev = None + for key, device in device_map.items(): + key = int(*[x for x in key.split(".") if x.isdecimal()]) + if key == i: + dev = device + break + if dev is None: + raise ValueError + layers[key].to(dev, torch.float16, False) + + for module in remaining: + module.to(m.primary_gpu) + + return model + class model_backend(HFTorchInferenceModel): def is_valid(self, model_name, model_path, menu_path): @@ -166,7 +239,7 @@ class model_backend(HFTorchInferenceModel): self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() - def _patch_quant(self) -> None: + def _patch_quant(self, device_map) -> None: # QuantLinear loads on the CPU by default, using a lot of RAM! If we # load it to the same device that the weights are gonna be on, it # mysteriously uses no additional VRAM @@ -175,14 +248,54 @@ class model_backend(HFTorchInferenceModel): from gptq import quant_v2 from gptq import quant_v1 - def _ql_init_(self, *args, **kwargs): - ret = type(self)._unpatched_init(self, *args, **kwargs) - self.to("cuda:0") - return ret + def make_quant(module, names, bits, groupsize, name='', force_bias=False): + if isinstance(module, quant_v3.QuantLinear): + return - for quant_module in [quant_v3, quant_v2, quant_v1]: - quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__ - quant_module.QuantLinear.__init__ = _ql_init_ + for attr in dir(module): + tmp = getattr(module, attr) + name1 = name + '.' + attr if name != '' else attr + if name1 in names: + parts = name1.split(".") + device = None + for i in reversed(range(len(parts))): + maybe_key = ".".join(parts[:i]) + if maybe_key in device_map: + device = device_map[maybe_key] + break + + if device is None: + print(name1) + print(device_map) + raise ValueError + + print("[ql]", name1, device) + delattr(module, attr) + + ql = quant_v3.QuantLinear( + bits, + groupsize, + tmp.in_features, + tmp.out_features, + force_bias or tmp.bias is not None + ) + ql = ql.to(device) + + setattr(module, attr, ql) + + for name1, child in module.named_children(): + make_quant(child, names, bits, groupsize, name + '.' 
+ name1 if name != '' else name1, force_bias=force_bias) + + quant_v3.make_quant = make_quant + + # def _ql_init_(self, *args, **kwargs): + # ret = type(self)._unpatched_init(self, *args, **kwargs) + # self.to("cuda:0") + # return ret + + # for quant_module in [quant_v3, quant_v2, quant_v1]: + # quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__ + # quant_module.QuantLinear.__init__ = _ql_init_ def _get_model(self, location: str, tf_kwargs: Dict): @@ -193,9 +306,12 @@ class model_backend(HFTorchInferenceModel): from gptq.opt import load_quant as opt_load_quant from gptq.bigcode import load_quant as bigcode_load_quant from gptq.mpt import load_quant as mpt_load_quant - from gptq.offload import load_quant_offload - self._patch_quant() + try: + import hf_bleeding_edge + from hf_bleeding_edge import AutoModelForCausalLM + except ImportError: + from transformers import AutoModelForCausalLM gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location) v2_bias = False @@ -208,22 +324,43 @@ class model_backend(HFTorchInferenceModel): logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}") + device_map = {} + + if self.lazy_load: + with lazy_loader.use_lazy_load(dematerialized_modules=True): + metamodel = AutoModelForCausalLM.from_config(self.model_config) + if utils.args.cpu: + device_map = {name: "cpu" for name in utils.layers_module_names} + for name in utils.get_missing_module_names( + metamodel, list(device_map.keys()) + ): + device_map[name] = "cpu" + else: + device_map = self.breakmodel_config.get_device_map( + metamodel + ) + + self._patch_quant(device_map) + with lazy_loader.use_lazy_load( enable=self.lazy_load, dematerialized_modules=False, ): if model_type == "gptj": - model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + model = load_quant_offload_device_map(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "gpt_neox": - model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + model = load_quant_offload_device_map(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "llama": - model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + print("YE LAMA") + + # model = llama_load_quant(location, gptq_file, gptq_bits, gptq_groupsize, force_bias=v2_bias) + model = load_quant_offload_device_map(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "opt": - model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + model = load_quant_offload_device_map(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "mpt": - model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias) + model = load_quant_offload_device_map(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "gpt_bigcode": - model = 
load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half() + model = load_quant_offload_device_map(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias).half() else: try: import auto_gptq @@ -231,12 +368,6 @@ class model_backend(HFTorchInferenceModel): except ImportError: raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit") - try: - import hf_bleeding_edge - from hf_bleeding_edge import AutoModelForCausalLM - except ImportError: - from transformers import AutoModelForCausalLM - # Monkey patch in hf_bleeding_edge to avoid having to trust remote code auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig From ad4528b5a6882e1bdb46e111be90d6f931090733 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 17:17:57 -0500 Subject: [PATCH 8/9] critical change --- modeling/inference_models/gptq_hf_torch/class.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 45d18f7b..10349388 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -103,7 +103,6 @@ def load_quant_offload_device_map( print(device_map) m, layers, remaining = find_layers(model) - type(m).non_offload_forward = type(m).forward # Hook offload_forward into found model From c80de5120c3bfd28f5a4963eabd562915bc7d015 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 24 Jul 2023 19:45:33 -0500 Subject: [PATCH 9/9] Cleanup --- .../inference_models/gptq_hf_torch/class.py | 69 ++++++------------- 1 file changed, 22 insertions(+), 47 deletions(-) diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py index 10349388..6fae6779 100644 --- a/modeling/inference_models/gptq_hf_torch/class.py +++ b/modeling/inference_models/gptq_hf_torch/class.py @@ -100,8 +100,6 @@ def load_quant_offload_device_map( from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeModel model = load_quant_func(model, checkpoint, wbits, groupsize, force_bias=force_bias) - print(device_map) - m, layers, remaining = find_layers(model) type(m).non_offload_forward = type(m).forward @@ -120,7 +118,6 @@ def load_quant_offload_device_map( raise RuntimeError(f"Model type {type(m)} not supported by CPU offloader") layers_done = len([1 for v in device_map.values() if v != "cpu"]) - print("LDone", layers_done) m.cpu_device = torch.device("cpu") m.fast_offload = layers_done > len(layers) // 2 @@ -134,10 +131,6 @@ def load_quant_offload_device_map( if "layers" not in dir(m): m.layers = layers - print(len(layers)) - print(len(device_map)) - - print(m.primary_gpu) for i in range(len(layers)): dev = None for key, device in device_map.items(): @@ -184,10 +177,6 @@ class model_backend(HFTorchInferenceModel): except (ValueError, AttributeError): self.gpu_layers_list = [utils.num_layers(self.model_config)] - tf_kwargs = { - "low_cpu_mem_usage": True, - } - # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel)) @@ -200,9 +189,6 @@ class model_backend(HFTorchInferenceModel): 
self.breakmodel_device_config(self.model_config) if self.lazy_load: - # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - tf_kwargs.pop("low_cpu_mem_usage", None) - # If we're using lazy loader, we need to figure out what the model's hidden layers are called with lazy_loader.use_lazy_load(dematerialized_modules=True): try: @@ -218,7 +204,7 @@ class model_backend(HFTorchInferenceModel): if self.get_local_model_path(): # Model is stored locally, load it. - self.model = self._get_model(self.get_local_model_path(), tf_kwargs) + self.model = self._get_model(self.get_local_model_path()) self.tokenizer = self._get_tokenizer(self.get_local_model_path()) else: raise NotImplementedError("GPTQ Model downloading not implemented") @@ -238,17 +224,9 @@ class model_backend(HFTorchInferenceModel): self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() - def _patch_quant(self, device_map) -> None: - # QuantLinear loads on the CPU by default, using a lot of RAM! If we - # load it to the same device that the weights are gonna be on, it - # mysteriously uses no additional VRAM - - from gptq import quant_v3 - from gptq import quant_v2 - from gptq import quant_v1 - - def make_quant(module, names, bits, groupsize, name='', force_bias=False): - if isinstance(module, quant_v3.QuantLinear): + def _patch_quant(self, device_map, quant_module) -> None: + def make_quant(module, names, bits, groupsize, name='', force_bias=False, **kwargs): + if isinstance(module, quant_module.QuantLinear): return for attr in dir(module): @@ -264,19 +242,17 @@ class model_backend(HFTorchInferenceModel): break if device is None: - print(name1) - print(device_map) - raise ValueError + raise ValueError(f"No device for {name1}") - print("[ql]", name1, device) delattr(module, attr) - ql = quant_v3.QuantLinear( + ql = quant_module.QuantLinear( bits, groupsize, tmp.in_features, tmp.out_features, - force_bias or tmp.bias is not None + force_bias or tmp.bias is not None, + **kwargs, ) ql = ql.to(device) @@ -285,19 +261,21 @@ class model_backend(HFTorchInferenceModel): for name1, child in module.named_children(): make_quant(child, names, bits, groupsize, name + '.' 
+ name1 if name != '' else name1, force_bias=force_bias) - quant_v3.make_quant = make_quant - - # def _ql_init_(self, *args, **kwargs): - # ret = type(self)._unpatched_init(self, *args, **kwargs) - # self.to("cuda:0") - # return ret - - # for quant_module in [quant_v3, quant_v2, quant_v1]: - # quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__ - # quant_module.QuantLinear.__init__ = _ql_init_ + quant_module.make_quant = make_quant - def _get_model(self, location: str, tf_kwargs: Dict): + def _patch_quants(self, device_map) -> None: + # Load QuantLinears on the device corresponding to the device map + + from gptq import quant_v3 + from gptq import quant_v2 + from gptq import quant_v1 + + for quant_module in [quant_v3, quant_v2, quant_v1]: + self._patch_quant(device_map, quant_module) + + + def _get_model(self, location: str): import gptq from gptq.gptj import load_quant as gptj_load_quant from gptq.gptneox import load_quant as gptneox_load_quant @@ -339,7 +317,7 @@ class model_backend(HFTorchInferenceModel): metamodel ) - self._patch_quant(device_map) + self._patch_quants(device_map) with lazy_loader.use_lazy_load( enable=self.lazy_load, @@ -350,9 +328,6 @@ class model_backend(HFTorchInferenceModel): elif model_type == "gpt_neox": model = load_quant_offload_device_map(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "llama": - print("YE LAMA") - - # model = llama_load_quant(location, gptq_file, gptq_bits, gptq_groupsize, force_bias=v2_bias) model = load_quant_offload_device_map(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias) elif model_type == "opt": model = load_quant_offload_device_map(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)