Cleanup
Commit in a mirror of https://github.com/KoboldAI/KoboldAI-Client.git
@@ -100,8 +100,6 @@ def load_quant_offload_device_map(
     from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeModel
     model = load_quant_func(model, checkpoint, wbits, groupsize, force_bias=force_bias)
 
-    print(device_map)
-
     m, layers, remaining = find_layers(model)
     type(m).non_offload_forward = type(m).forward
 
@@ -120,7 +118,6 @@ def load_quant_offload_device_map(
         raise RuntimeError(f"Model type {type(m)} not supported by CPU offloader")
 
     layers_done = len([1 for v in device_map.values() if v != "cpu"])
-    print("LDone", layers_done)
 
     m.cpu_device = torch.device("cpu")
     m.fast_offload = layers_done > len(layers) // 2
@@ -134,10 +131,6 @@ def load_quant_offload_device_map(
     if "layers" not in dir(m):
        m.layers = layers
 
-    print(len(layers))
-    print(len(device_map))
-
-    print(m.primary_gpu)
     for i in range(len(layers)):
         dev = None
         for key, device in device_map.items():
@@ -184,10 +177,6 @@ class model_backend(HFTorchInferenceModel):
         except (ValueError, AttributeError):
             self.gpu_layers_list = [utils.num_layers(self.model_config)]
 
-        tf_kwargs = {
-            "low_cpu_mem_usage": True,
-        }
-
         # If we're using torch_lazy_loader, we need to get breakmodel config
         # early so that it knows where to load the individual model tensors
         logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel))
@@ -200,9 +189,6 @@ class model_backend(HFTorchInferenceModel):
             self.breakmodel_device_config(self.model_config)
 
         if self.lazy_load:
-            # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
-            tf_kwargs.pop("low_cpu_mem_usage", None)
-
             # If we're using lazy loader, we need to figure out what the model's hidden layers are called
             with lazy_loader.use_lazy_load(dematerialized_modules=True):
                 try:
@@ -218,7 +204,7 @@ class model_backend(HFTorchInferenceModel):
 
         if self.get_local_model_path():
             # Model is stored locally, load it.
-            self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
+            self.model = self._get_model(self.get_local_model_path())
             self.tokenizer = self._get_tokenizer(self.get_local_model_path())
         else:
             raise NotImplementedError("GPTQ Model downloading not implemented")
@@ -238,17 +224,9 @@ class model_backend(HFTorchInferenceModel):
         self.model.kai_model = self
         utils.koboldai_vars.modeldim = self.get_hidden_size()
 
-    def _patch_quant(self, device_map) -> None:
-        # QuantLinear loads on the CPU by default, using a lot of RAM! If we
-        # load it to the same device that the weights are gonna be on, it
-        # mysteriously uses no additional VRAM
-
-        from gptq import quant_v3
-        from gptq import quant_v2
-        from gptq import quant_v1
-
-        def make_quant(module, names, bits, groupsize, name='', force_bias=False):
-            if isinstance(module, quant_v3.QuantLinear):
+    def _patch_quant(self, device_map, quant_module) -> None:
+        def make_quant(module, names, bits, groupsize, name='', force_bias=False, **kwargs):
+            if isinstance(module, quant_module.QuantLinear):
                 return
 
             for attr in dir(module):
@@ -264,19 +242,17 @@ class model_backend(HFTorchInferenceModel):
                            break
 
                    if device is None:
-                        print(name1)
-                        print(device_map)
-                        raise ValueError
+                        raise ValueError(f"No device for {name1}")
 
-                    print("[ql]", name1, device)
                    delattr(module, attr)
 
-                    ql = quant_v3.QuantLinear(
+                    ql = quant_module.QuantLinear(
                        bits,
                        groupsize,
                        tmp.in_features,
                        tmp.out_features,
-                        force_bias or tmp.bias is not None
+                        force_bias or tmp.bias is not None,
+                        **kwargs,
                    )
                    ql = ql.to(device)
 
@@ -285,19 +261,21 @@ class model_backend(HFTorchInferenceModel):
             for name1, child in module.named_children():
                 make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1, force_bias=force_bias)
 
-        quant_v3.make_quant = make_quant
+        quant_module.make_quant = make_quant
 
-        # def _ql_init_(self, *args, **kwargs):
-        #     ret = type(self)._unpatched_init(self, *args, **kwargs)
-        #     self.to("cuda:0")
-        #     return ret
-
-        # for quant_module in [quant_v3, quant_v2, quant_v1]:
-        #     quant_module.QuantLinear._unpatched_init = quant_module.QuantLinear.__init__
-        #     quant_module.QuantLinear.__init__ = _ql_init_
 
-
-    def _get_model(self, location: str, tf_kwargs: Dict):
+    def _patch_quants(self, device_map) -> None:
+        # Load QuantLinears on the device corresponding to the device map
+
+        from gptq import quant_v3
+        from gptq import quant_v2
+        from gptq import quant_v1
+
+        for quant_module in [quant_v3, quant_v2, quant_v1]:
+            self._patch_quant(device_map, quant_module)
+
+
+    def _get_model(self, location: str):
         import gptq
         from gptq.gptj import load_quant as gptj_load_quant
         from gptq.gptneox import load_quant as gptneox_load_quant
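To make the device-aware patching concrete, here is a minimal, self-contained sketch of the pattern that _patch_quant applies to each quant module: replace the module's make_quant factory with a wrapper that looks up each target layer in the device map, builds the quantized linear, and moves it to that device right away instead of leaving it on the CPU. FakeQuantLinear and the toy model below are stand-ins; the real QuantLinear classes come from gptq.quant_v1/v2/v3 and also take bits, groupsize, and a bias flag, as the diff shows.

import types
import torch

class FakeQuantLinear(torch.nn.Linear):
    """Stand-in for a gptq QuantLinear; only here to make the sketch runnable."""

# Stand-in for one of the gptq quant modules (quant_v1/v2/v3).
fake_quant_module = types.SimpleNamespace(QuantLinear=FakeQuantLinear)

def patch_quant(device_map, quant_module):
    """Replace quant_module.make_quant so new QuantLinears land on the mapped device."""
    def make_quant(module, names, bits, groupsize, name=""):
        if isinstance(module, quant_module.QuantLinear):
            return
        for attr in dir(module):
            tmp = getattr(module, attr)
            name1 = name + "." + attr if name else attr
            if name1 not in names:
                continue
            # Assumed matching rule: first device_map key prefixing the name wins.
            device = next((d for k, d in device_map.items() if name1.startswith(k)), None)
            if device is None:
                raise ValueError(f"No device for {name1}")
            delattr(module, attr)
            # The real QuantLinear constructor also takes bits, groupsize and a
            # bias flag; this stand-in only needs the layer shape.
            ql = quant_module.QuantLinear(tmp.in_features, tmp.out_features).to(device)
            setattr(module, attr, ql)
        for child_name, child in module.named_children():
            make_quant(child, names, bits, groupsize,
                       name + "." + child_name if name else child_name)
    quant_module.make_quant = make_quant

# Usage sketch: quantize a toy model's "proj" layer onto the CPU
# (substitute "cuda:0" when a GPU is present).
model = torch.nn.Module()
model.proj = torch.nn.Linear(4, 4)
patch_quant({"proj": "cpu"}, fake_quant_module)
fake_quant_module.make_quant(model, names=["proj"], bits=4, groupsize=128)
assert isinstance(model.proj, FakeQuantLinear)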
@@ -339,7 +317,7 @@ class model_backend(HFTorchInferenceModel):
                 metamodel
             )
 
-        self._patch_quant(device_map)
+        self._patch_quants(device_map)
 
         with lazy_loader.use_lazy_load(
             enable=self.lazy_load,
@@ -350,9 +328,6 @@ class model_backend(HFTorchInferenceModel):
             elif model_type == "gpt_neox":
                 model = load_quant_offload_device_map(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
             elif model_type == "llama":
-                print("YE LAMA")
-
-                # model = llama_load_quant(location, gptq_file, gptq_bits, gptq_groupsize, force_bias=v2_bias)
                 model = load_quant_offload_device_map(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
             elif model_type == "opt":
                 model = load_quant_offload_device_map(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, device_map, force_bias=v2_bias)
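The elif chain above dispatches on model_type and always ends in the same load_quant_offload_device_map call, varying only the architecture-specific loader. A hypothetical, self-contained illustration of that dispatch follows; the stub loaders merely stand in for gptj_load_quant, gptneox_load_quant, llama_load_quant, and opt_load_quant from the gptq package, and the model_type keys are illustrative.

from typing import Callable, Dict

def _stub_loader(arch: str) -> Callable[..., str]:
    # Placeholder for the gptq load_quant functions used in the diff above.
    def load_quant(*args, **kwargs) -> str:
        return f"{arch} loader called"
    return load_quant

LOADERS: Dict[str, Callable[..., str]] = {
    "gptj": _stub_loader("gptj"),
    "gpt_neox": _stub_loader("gpt_neox"),
    "llama": _stub_loader("llama"),
    "opt": _stub_loader("opt"),
}

def pick_loader(model_type: str) -> Callable[..., str]:
    try:
        return LOADERS[model_type]
    except KeyError:
        raise RuntimeError(f"Unsupported GPTQ model type {model_type!r}") from None

print(pick_loader("llama")())  # llama loader called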