Only import big python modules for GPTQ once they get used

0cc4m
2023-07-23 22:07:34 +02:00
parent 49740aa5ab
commit 973aea12ea

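The change follows the standard deferred-import pattern: heavy optional dependencies are imported inside the function that actually needs them, so merely importing the backend module stays cheap and a missing dependency only fails when a 4-bit load is attempted. A minimal standalone sketch of that pattern, using the hypothetical placeholder names heavy_gptq_lib and load_4bit rather than the project's real API:

def load_4bit(location: str):
    # Deferred import: the expensive module is only pulled in when a
    # 4-bit model is actually loaded, not at module import time.
    # heavy_gptq_lib is a hypothetical placeholder, not a real package.
    try:
        import heavy_gptq_lib
    except ImportError:
        raise RuntimeError("4-bit load failed: heavy_gptq_lib is not installed")
    return heavy_gptq_lib.load_quant(location)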

@@ -9,13 +9,6 @@ import shutil
import sys
from typing import Union
-from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer
-try:
-    import hf_bleeding_edge
-    from hf_bleeding_edge import AutoModelForCausalLM
-except ImportError:
-    from transformers import AutoModelForCausalLM
import utils
import modeling.lazy_loader as lazy_loader
import koboldai_settings
@@ -24,23 +17,7 @@ from logger import logger, set_logger_verbosity
from modeling.inference_models.hf_torch import HFTorchInferenceModel
from modeling.tokenizer import GenericTokenizer
-# 4-bit dependencies
-import gptq
from pathlib import Path
-from gptq.gptj import load_quant as gptj_load_quant
-from gptq.gptneox import load_quant as gptneox_load_quant
-from gptq.llama import load_quant as llama_load_quant
-from gptq.opt import load_quant as opt_load_quant
-from gptq.bigcode import load_quant as bigcode_load_quant
-from gptq.mpt import load_quant as mpt_load_quant
-from gptq.offload import load_quant_offload
-autogptq_support = True
-try:
-    import auto_gptq
-    from auto_gptq import AutoGPTQForCausalLM
-except ImportError:
-    autogptq_support = False

model_backend_type = "GPTQ"
@@ -185,6 +162,15 @@ class model_backend(HFTorchInferenceModel):
        utils.koboldai_vars.modeldim = self.get_hidden_size()

    def _get_model(self, location: str, tf_kwargs: Dict):
+        import gptq
+        from gptq.gptj import load_quant as gptj_load_quant
+        from gptq.gptneox import load_quant as gptneox_load_quant
+        from gptq.llama import load_quant as llama_load_quant
+        from gptq.opt import load_quant as opt_load_quant
+        from gptq.bigcode import load_quant as bigcode_load_quant
+        from gptq.mpt import load_quant as mpt_load_quant
+        from gptq.offload import load_quant_offload

        gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location)

        v2_bias = False
@@ -207,7 +193,19 @@ class model_backend(HFTorchInferenceModel):
            model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias)
        elif model_type == "gpt_bigcode":
            model = load_quant_offload(bigcode_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list, force_bias=v2_bias).half()
-        elif autogptq_support:
+        else:
+            try:
+                import auto_gptq
+                from auto_gptq import AutoGPTQForCausalLM
+            except ImportError:
+                raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")
+
+            try:
+                import hf_bleeding_edge
+                from hf_bleeding_edge import AutoModelForCausalLM
+            except ImportError:
+                from transformers import AutoModelForCausalLM
+
            # Monkey patch in hf_bleeding_edge to avoid having to trust remote code
            auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
            auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
@@ -227,12 +225,12 @@ class model_backend(HFTorchInferenceModel):
                    return self.model.generate(*args, **kwargs)

                type(model).generate = generate
-        else:
-            raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")

        return model

    def _get_tokenizer(self, location: str):
+        from transformers import AutoTokenizer, LlamaTokenizer

        model_type = self.get_model_type()

        if model_type == "llama":
            tokenizer = LlamaTokenizer.from_pretrained(location)
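The diff also keeps the existing try/except ImportError fallback that prefers the patched hf_bleeding_edge package and drops back to stock transformers when it is not installed, now evaluated inside _get_model rather than at import time. A minimal sketch of that fallback on its own (the load_causal_lm helper is illustrative, not part of the backend):

# Prefer hf_bleeding_edge when available, otherwise use stock transformers.
try:
    from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
    from transformers import AutoModelForCausalLM

def load_causal_lm(location: str):
    # Illustrative helper: whichever AutoModelForCausalLM was resolved
    # above handles loading the weights from `location`.
    return AutoModelForCausalLM.from_pretrained(location)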