From a73420c49c1371c49b59816d3122d6e6d4f3b676 Mon Sep 17 00:00:00 2001
From: somebody
Date: Mon, 24 Jul 2023 17:15:59 -0500
Subject: [PATCH] really really really sketchy breakmodel implementation

im gonna go lie down for an extended period of time
---
 .../inference_models/gptq_hf_torch/class.py   | 175 +++++++++++++++---
 1 file changed, 153 insertions(+), 22 deletions(-)

diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py
index 74f11e18..45d18f7b 100644
--- a/modeling/inference_models/gptq_hf_torch/class.py
+++ b/modeling/inference_models/gptq_hf_torch/class.py
@@ -82,6 +82,79 @@ def get_gptq_version(fpath):
             logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}")
         return 0, False
+def load_quant_offload_device_map(
+    load_quant_func, model, checkpoint, wbits, groupsize, device_map, offload_type=0, force_bias=False,
+):
+    from gptq.offload import (
+        find_layers,
+        llama_offload_forward,
+        gptneox_offload_forward,
+        gptj_offload_forward,
+        opt_offload_forward,
+        bigcode_offload_forward
+    )
+    from transformers.models.llama.modeling_llama import LlamaModel
+    from transformers.models.opt.modeling_opt import OPTModel
+    from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXModel
+    from transformers.models.gptj.modeling_gptj import GPTJModel
+    from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeModel
+    model = load_quant_func(model, checkpoint, wbits, groupsize, force_bias=force_bias)
+
+    print(device_map)
+
+    m, layers, remaining = find_layers(model)
+
+    type(m).non_offload_forward = type(m).forward
+
+    # Hook offload_forward into found model
+    if type(m) == LlamaModel:
+        type(m).forward = llama_offload_forward
+    elif type(m) == GPTNeoXModel:
+        type(m).forward = gptneox_offload_forward
+    elif type(m) == GPTJModel:
+        type(m).forward = gptj_offload_forward
+    elif type(m) == OPTModel:
+        type(m).forward = opt_offload_forward
+    elif type(m) == GPTBigCodeModel:
+        type(m).forward = bigcode_offload_forward
+    else:
+        raise RuntimeError(f"Model type {type(m)} not supported by CPU offloader")
+
+    layers_done = len([1 for v in device_map.values() if v != "cpu"])
+    print("LDone", layers_done)
+
+    m.cpu_device = torch.device("cpu")
+    m.fast_offload = layers_done > len(layers) // 2
+    m.layer_count = len(layers)
+    m.cpu_layers = len(layers) - layers_done
+    m.gpu_layers = layers_done
+    m.offload_type = offload_type
+    # HACK
+    m.primary_gpu = list(device_map.values())[0]
+
+    if "layers" not in dir(m):
+        m.layers = layers
+
+    print(len(layers))
+    print(len(device_map))
+
+    print(m.primary_gpu)
+    for i in range(len(layers)):
+        dev = None
+        for key, device in device_map.items():
+            key = int(*[x for x in key.split(".") if x.isdecimal()])
+            if key == i:
+                dev = device
+                break
+        if dev is None:
+            raise ValueError
+        layers[key].to(dev, torch.float16, False)
+
+    for module in remaining:
+        module.to(m.primary_gpu)
+
+    return model
+


 class model_backend(HFTorchInferenceModel):
     def is_valid(self, model_name, model_path, menu_path):
@@ -166,7 +239,7 @@ class model_backend(HFTorchInferenceModel):
         self.model.kai_model = self
         utils.koboldai_vars.modeldim = self.get_hidden_size()

-    def _patch_quant(self) -> None:
+    def _patch_quant(self, device_map) -> None:
         # QuantLinear loads on the CPU by default, using a lot of RAM! If we
         # load it to the same device that the weights are gonna be on, it
         # mysteriously uses no additional VRAM
@@ -175,14 +248,54 @@ class model_backend(HFTorchInferenceModel):
         from gptq import quant_v2
         from gptq import quant_v1

-        def _ql_init_(self, *args, **kwargs):
-            ret = type(self)._unpatched_init(self, *args, **kwargs)
-            self.to("cuda:0")
-            return ret
+        def make_quant(module, names, bits, groupsize, name='', force_bias=False):
+            if isinstance(module, quant_v3.QuantLinear):
+                return

-        for quant_module in [quant_v3, quant_v2, quant_v1]:
-            quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__
-            quant_module.QuantLinear.__init__ = _ql_init_
+            for attr in dir(module):
+                tmp = getattr(module, attr)
+                name1 = name + '.' + attr if name != '' else attr
+                if name1 in names:
+                    parts = name1.split(".")
+                    device = None
+                    for i in reversed(range(len(parts))):
+                        maybe_key = ".".join(parts[:i])
+                        if maybe_key in device_map:
+                            device = device_map[maybe_key]
+                            break
+
+                    if device is None:
+                        print(name1)
+                        print(device_map)
+                        raise ValueError
+
+                    print("[ql]", name1, device)
+                    delattr(module, attr)
+
+                    ql = quant_v3.QuantLinear(
+                        bits,
+                        groupsize,
+                        tmp.in_features,
+                        tmp.out_features,
+                        force_bias or tmp.bias is not None
+                    )
+                    ql = ql.to(device)
+
+                    setattr(module, attr, ql)
+
+            for name1, child in module.named_children():
+                make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1, force_bias=force_bias)
+
+        quant_v3.make_quant = make_quant
+
+        # def _ql_init_(self, *args, **kwargs):
+        #     ret = type(self)._unpatched_init(self, *args, **kwargs)
+        #     self.to("cuda:0")
+        #     return ret
+
+        # for quant_module in [quant_v3, quant_v2, quant_v1]:
+        #     quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__
+        #     quant_module.QuantLinear.__init__ = _ql_init_

     def _get_model(self, location: str, tf_kwargs: Dict):
@@ -193,9 +306,12 @@ class model_backend(HFTorchInferenceModel):
         from gptq.opt import load_quant as opt_load_quant
         from gptq.bigcode import load_quant as bigcode_load_quant
         from gptq.mpt import load_quant as mpt_load_quant
-        from gptq.offload import load_quant_offload

-        self._patch_quant()
+        try:
+            import hf_bleeding_edge
+            from hf_bleeding_edge import AutoModelForCausalLM
+        except ImportError:
+            from transformers import AutoModelForCausalLM

         gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location)
         v2_bias = False
@@ -208,22 +324,43 @@ class model_backend(HFTorchInferenceModel):
         logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}")

+        device_map = {}
+
+        if self.lazy_load:
+            with lazy_loader.use_lazy_load(dematerialized_modules=True):
+                metamodel = AutoModelForCausalLM.from_config(self.model_config)
+                if utils.args.cpu:
+                    device_map = {name: "cpu" for name in utils.layers_module_names}
+                    for name in utils.get_missing_module_names(
+                        metamodel, list(device_map.keys())
+                    ):
+                        device_map[name] = "cpu"
+                else:
+                    device_map = self.breakmodel_config.get_device_map(
+                        metamodel
+                    )
+
+        self._patch_quant(device_map)
+
         with lazy_loader.use_lazy_load(
             enable=self.lazy_load,
             dematerialized_modules=False,
         ):
             if model_type == "gptj":
-                model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+                model = load_quant_offload_device_map(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
             elif model_type == "gpt_neox":
-                model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+                model = load_quant_offload_device_map(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
             elif model_type == "llama":
-                model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+                print("YE LAMA")
+
+                # model = llama_load_quant(location, gptq_file, gptq_bits, gptq_groupsize, force_bias=v2_bias)
+                model = load_quant_offload_device_map(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
             elif model_type == "opt":
-                model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+                model = load_quant_offload_device_map(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
             elif model_type == "mpt":
-                model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
+                model = load_quant_offload_device_map(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
             elif model_type == "gpt_bigcode":
-                model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half()
+                model = load_quant_offload_device_map(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias).half()
             else:
                 try:
                     import auto_gptq
                     from auto_gptq import AutoGPTQForCausalLM
@@ -231,12 +368,6 @@ class model_backend(HFTorchInferenceModel):
                 except ImportError:
                     raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")

-            try:
-                import hf_bleeding_edge
-                from hf_bleeding_edge import AutoModelForCausalLM
-            except ImportError:
-                from transformers import AutoModelForCausalLM
-
             # Monkey patch in hf_bleeding_edge to avoid having to trust remote code
             auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
             auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
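
Reviewer note (not part of the patch): a rough sketch of how the new loader is meant to be driven. The checkpoint path and the module names in the example device map are placeholders; in the backend the map actually comes from self.breakmodel_config.get_device_map(metamodel), or is forced to "cpu" under --cpu, and the load function is one of the gptq.* load_quant helpers already imported in _get_model.

    # Hypothetical usage sketch -- paths and module names are illustrative only.
    from gptq.llama import load_quant as llama_load_quant

    # Breakmodel-style placement: numbered decoder layers split across devices.
    # The first entry's device doubles as the "primary" GPU that receives every
    # module left over after the per-layer moves.
    device_map = {
        "model.layers.0": "cuda:0",
        "model.layers.1": "cuda:0",
        "model.layers.2": "cuda:1",
        "model.layers.3": "cpu",
        "model.embed_tokens": "cuda:0",
        "model.norm": "cuda:0",
        "lm_head": "cuda:0",
    }

    # load_quant_offload_device_map() builds the quantized model with the given
    # load_quant function, swaps in the matching *_offload_forward, moves each
    # numbered layer to the device whose key carries that index, and sends the
    # remaining modules to the primary GPU.
    model = load_quant_offload_device_map(
        llama_load_quant,
        "models/my-llama-4bit",   # model directory (placeholder)
        "model.safetensors",      # GPTQ checkpoint file (placeholder)
        4,                        # wbits
        128,                      # groupsize
        device_map,
    )

One quirk worth noting: in the layer-matching loop, a key with no numeric component collapses to int() == 0, so listing the numbered layer entries first keeps layer 0 from being matched against a key like "model.embed_tokens".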