Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-06-05 21:59:24 +02:00)
GPTQ: Patch QuantLinear to not use CPU RAM
@@ -7,7 +7,7 @@ import torch
 import re
 import shutil
 import sys
-from typing import Union
+from typing import Dict, Union
 
 import utils
 import modeling.lazy_loader as lazy_loader
@@ -167,6 +167,25 @@ class model_backend(HFTorchInferenceModel):
         self.model.kai_model = self
         utils.koboldai_vars.modeldim = self.get_hidden_size()
 
+    def _patch_quant(self) -> None:
+        # QuantLinear loads on the CPU by default, using a lot of RAM! If we
+        # load it to the same device that the weights are gonna be on, it
+        # mysteriously uses no additional VRAM
+
+        from gptq import quant_v3
+        from gptq import quant_v2
+        from gptq import quant_v1
+
+        def _ql_init_(self, *args, **kwargs):
+            ret = type(self)._unpatched_init(self, *args, **kwargs)
+            self.to("cuda:0")
+            return ret
+
+        for quant_module in [quant_v3, quant_v2, quant_v1]:
+            quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__
+            quant_module.QuantLinear.__init__ = _ql_init_
+
+
     def _get_model(self, location: str, tf_kwargs: Dict):
         import gptq
         from gptq.gptj import load_quant as gptj_load_quant
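The new _patch_quant method is a plain monkey patch: it saves each QuantLinear class's original __init__ and swaps in a wrapper that moves the freshly constructed module to cuda:0 right away, so the quantized weights never linger in system RAM. A minimal sketch of the same idea, using torch.nn.Linear as a stand-in for the gptq QuantLinear classes (the _patched_init name, the cuda-availability guard, and the example layer are illustrative, not part of the commit):

    import torch
    import torch.nn as nn

    def _patched_init(self, *args, **kwargs):
        # Run the original constructor first (it allocates the parameters on
        # the CPU), then immediately move the module to the first GPU so the
        # new tensors are freed from system RAM.
        ret = type(self)._unpatched_init(self, *args, **kwargs)
        if torch.cuda.is_available():
            self.to("cuda:0")
        return ret

    nn.Linear._unpatched_init = nn.Linear.__init__
    nn.Linear.__init__ = _patched_init

    layer = nn.Linear(16, 16)
    print(layer.weight.device)  # cuda:0 when a GPU is present, else cpu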
@@ -177,6 +196,8 @@ class model_backend(HFTorchInferenceModel):
         from gptq.mpt import load_quant as mpt_load_quant
         from gptq.offload import load_quant_offload
 
+        self._patch_quant()
+
         gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location)
         v2_bias = False
 
@@ -358,7 +358,6 @@ def safetensors_load_tensor_independently(
 ) -> torch.Tensor:
     """A hacky way to load a tensor by itself and not mmap every single tensor
     or whatever is causing that big memory spike"""
-    print("[ld]", tensor_key)
 
     with safetensors.safe_open(checkpoint_file, framework="pt", device=device) as f:
         return f.get_tensor(tensor_key)
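This hunk only drops a stray debug print; the function keeps using safetensors' safe_open, which opens the checkpoint lazily and materializes just the requested tensor. A small illustration of that API (the file name and tensor key here are made up for the example):

    import safetensors

    # Open the checkpoint lazily; no tensor data is read yet.
    with safetensors.safe_open("model.safetensors", framework="pt", device="cpu") as f:
        print(list(f.keys())[:5])                # names of the stored tensors
        tensor = f.get_tensor("lm_head.weight")  # loads only this one tensor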
@@ -379,7 +378,7 @@ def patch_safetensors(callback):
             # (70 tensors/s -> 65 tensor/s). The memory savings probably
             # shouldn't be the happening, maybe there's a memory leak
             # somewhere in our pipeline with CPU tensors.
-            intermediary_device = "cuda"
+            intermediary_device = "cuda:0"
         else:
             intermediary_device = "cpu"
 
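Switching the intermediary device from "cuda" to "cuda:0" pins the staging copies to the first GPU rather than whichever CUDA device happens to be current, matching the "cuda:0" string used by the QuantLinear patch above. A quick illustration of the difference between the two device strings:

    import torch

    print(torch.device("cuda"))    # device(type='cuda')         -- index resolved lazily
    print(torch.device("cuda:0"))  # device(type='cuda', index=0) -- always the first GPU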