From a6aafb252534b26bbdf034788895c3317b4cdd53 Mon Sep 17 00:00:00 2001
From: somebody
Date: Mon, 24 Jul 2023 13:07:30 -0500
Subject: [PATCH] GPTQ: Patch QuantLinear to not use CPU RAM

---
 .../inference_models/gptq_hf_torch/class.py | 23 ++++++++++++++++++-
 modeling/lazy_loader.py                     |  3 +--
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py
index 9a1b872e..d942a539 100644
--- a/modeling/inference_models/gptq_hf_torch/class.py
+++ b/modeling/inference_models/gptq_hf_torch/class.py
@@ -7,7 +7,7 @@ import torch
 import re
 import shutil
 import sys
-from typing import Union
+from typing import Dict, Union
 
 import utils
 import modeling.lazy_loader as lazy_loader
@@ -167,6 +167,25 @@ class model_backend(HFTorchInferenceModel):
         self.model.kai_model = self
         utils.koboldai_vars.modeldim = self.get_hidden_size()
 
+    def _patch_quant(self) -> None:
+        # QuantLinear loads on the CPU by default, using a lot of RAM! If we
+        # load it to the same device that the weights are gonna be on, it
+        # mysteriously uses no additional VRAM
+
+        from gptq import quant_v3
+        from gptq import quant_v2
+        from gptq import quant_v1
+
+        def _ql_init_(self, *args, **kwargs):
+            ret = type(self)._unpatched_init(self, *args, **kwargs)
+            self.to("cuda:0")
+            return ret
+
+        for quant_module in [quant_v3, quant_v2, quant_v1]:
+            quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__
+            quant_module.QuantLinear.__init__ = _ql_init_
+
+
     def _get_model(self, location: str, tf_kwargs: Dict):
         import gptq
         from gptq.gptj import load_quant as gptj_load_quant
@@ -177,6 +196,8 @@ class model_backend(HFTorchInferenceModel):
         from gptq.mpt import load_quant as mpt_load_quant
         from gptq.offload import load_quant_offload
 
+        self._patch_quant()
+
         gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location)
         v2_bias = False
 
diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py
index 8fff59d3..a5e7c58f 100644
--- a/modeling/lazy_loader.py
+++ b/modeling/lazy_loader.py
@@ -358,7 +358,6 @@ def safetensors_load_tensor_independently(
 ) -> torch.Tensor:
     """A hacky way to load a tensor by itself and not mmap every single
     tensor or whatever is causing that big memory spike"""
-    print("[ld]", tensor_key)
     with safetensors.safe_open(checkpoint_file, framework="pt", device=device) as f:
         return f.get_tensor(tensor_key)
 
@@ -379,7 +378,7 @@ def patch_safetensors(callback):
             # (70 tensors/s -> 65 tensor/s). The memory savings probably
             # shouldn't be the happening, maybe there's a memory leak
             # somewhere in our pipeline with CPU tensors.
-            intermediary_device = "cuda"
+            intermediary_device = "cuda:0"
         else:
             intermediary_device = "cpu"
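
With the patch applied, every QuantLinear that the gptq package constructs should land directly on cuda:0 instead of being materialised in CPU RAM first. The snippet below is an illustrative sanity check, not part of the patch: it assumes the same gptq module layout the patch imports (quant_v1/quant_v2/quant_v3) and a loaded `model` object, for example whatever the backend's _get_model() returns.

# Illustrative sketch only -- not part of the patch. Assumes the gptq
# package layout used above (quant_v1/quant_v2/quant_v3) and an already
# loaded `model`.
import itertools
import torch
from gptq import quant_v1, quant_v2, quant_v3

QUANT_LINEAR_TYPES = tuple(m.QuantLinear for m in (quant_v1, quant_v2, quant_v3))

def check_quant_devices(model: torch.nn.Module) -> None:
    # With the patched __init__, each QuantLinear's tensors (qweight,
    # scales, zeros, ... -- whether registered as parameters or buffers)
    # should already report cuda:0.
    for name, module in model.named_modules():
        if isinstance(module, QUANT_LINEAR_TYPES):
            tensors = itertools.chain(
                module.parameters(recurse=False), module.buffers(recurse=False)
            )
            devices = {t.device for t in tensors}
            assert devices <= {torch.device("cuda:0")}, f"{name} is on {devices}"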