From a6aafb252534b26bbdf034788895c3317b4cdd53 Mon Sep 17 00:00:00 2001
From: somebody
Date: Mon, 24 Jul 2023 13:07:30 -0500
Subject: [PATCH] GPTQ: Patch QuantLinear to not use CPU RAM

---
 .../inference_models/gptq_hf_torch/class.py | 23 ++++++++++++++++++-
 modeling/lazy_loader.py                     |  3 +--
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py
index 9a1b872e..d942a539 100644
--- a/modeling/inference_models/gptq_hf_torch/class.py
+++ b/modeling/inference_models/gptq_hf_torch/class.py
@@ -7,7 +7,7 @@ import torch
 import re
 import shutil
 import sys
-from typing import Union
+from typing import Dict, Union
 
 import utils
 import modeling.lazy_loader as lazy_loader
@@ -167,6 +167,25 @@ class model_backend(HFTorchInferenceModel):
         self.model.kai_model = self
         utils.koboldai_vars.modeldim = self.get_hidden_size()
 
+    def _patch_quant(self) -> None:
+        # QuantLinear loads on the CPU by default, using a lot of RAM! If we
+        # load it to the same device that the weights are gonna be on, it
+        # mysteriously uses no additional VRAM
+
+        from gptq import quant_v3
+        from gptq import quant_v2
+        from gptq import quant_v1
+
+        def _ql_init_(self, *args, **kwargs):
+            ret = type(self)._unpatched_init(self, *args, **kwargs)
+            self.to("cuda:0")
+            return ret
+
+        for quant_module in [quant_v3, quant_v2, quant_v1]:
+            quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__
+            quant_module.QuantLinear.__init__ = _ql_init_
+
+
     def _get_model(self, location: str, tf_kwargs: Dict):
         import gptq
         from gptq.gptj import load_quant as gptj_load_quant
@@ -177,6 +196,8 @@ class model_backend(HFTorchInferenceModel):
         from gptq.mpt import load_quant as mpt_load_quant
         from gptq.offload import load_quant_offload
 
+        self._patch_quant()
+
         gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location)
         v2_bias = False
 
diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py
index 8fff59d3..a5e7c58f 100644
--- a/modeling/lazy_loader.py
+++ b/modeling/lazy_loader.py
@@ -358,7 +358,6 @@ def safetensors_load_tensor_independently(
 ) -> torch.Tensor:
     """A hacky way to load a tensor by itself and not mmap every single
     tensor or whatever is causing that big memory spike"""
-    print("[ld]", tensor_key)
     with safetensors.safe_open(checkpoint_file, framework="pt", device=device) as f:
         return f.get_tensor(tensor_key)
 
@@ -379,7 +378,7 @@ def patch_safetensors(callback):
             # (70 tensors/s -> 65 tensor/s). The memory savings probably
             # shouldn't be the happening, maybe there's a memory leak
             # somewhere in our pipeline with CPU tensors.
-            intermediary_device = "cuda"
+            intermediary_device = "cuda:0"
         else:
             intermediary_device = "cpu"
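
With the patch applied, every QuantLinear that the gptq package constructs should land directly on cuda:0 instead of being materialised in CPU RAM first. The snippet below is an illustrative sanity check, not part of the patch: it assumes the same gptq module layout the patch imports (quant_v1/quant_v2/quant_v3) and a loaded `model` object, for example whatever the backend's _get_model() returns.

# Illustrative sketch only -- not part of the patch. Assumes the gptq
# package layout used above (quant_v1/quant_v2/quant_v3) and an already
# loaded `model`.
import itertools
import torch
from gptq import quant_v1, quant_v2, quant_v3

QUANT_LINEAR_TYPES = tuple(m.QuantLinear for m in (quant_v1, quant_v2, quant_v3))

def check_quant_devices(model: torch.nn.Module) -> None:
    # With the patched __init__, each QuantLinear's tensors (qweight,
    # scales, zeros, ... -- whether registered as parameters or buffers)
    # should already report cuda:0.
    for name, module in model.named_modules():
        if isinstance(module, QUANT_LINEAR_TYPES):
            tensors = itertools.chain(
                module.parameters(recurse=False), module.buffers(recurse=False)
            )
            devices = {t.device for t in tensors}
            assert devices <= {torch.device("cuda:0")}, f"{name} is on {devices}"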