From e49d35afc935f3a52155a0bc9f9d200a84e1ad41 Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Sun, 28 May 2023 22:54:36 +0200
Subject: [PATCH] Add 4bit plugin

---
 aiserver.py                                   |   1 -
 .../inference_models/4bit_hf_torch/class.py   | 227 ++++
 .../generic_hf_torch/class.py                 |  13 +-
 modeling/inference_models/hf.py               |   4 -
 modeling/inference_models/hf_torch_4bit.py    | 392 ------
 5 files changed, 233 insertions(+), 404 deletions(-)
 create mode 100644 modeling/inference_models/4bit_hf_torch/class.py
 delete mode 100644 modeling/inference_models/hf_torch_4bit.py

diff --git a/aiserver.py b/aiserver.py
index c28633d6..3c574431 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -623,7 +623,6 @@ utils.socketio = socketio
 
 # Weird import position to steal koboldai_vars from utils
 from modeling.patches import patch_transformers
-from modeling.inference_models.hf_torch_4bit import load_model_gptq_settings
 
 #Load all of the model importers
 import importlib
diff --git a/modeling/inference_models/4bit_hf_torch/class.py b/modeling/inference_models/4bit_hf_torch/class.py
new file mode 100644
index 00000000..62f04bfb
--- /dev/null
+++ b/modeling/inference_models/4bit_hf_torch/class.py
@@ -0,0 +1,227 @@
+from __future__ import annotations
+
+import os
+import glob
+import json
+import torch
+import re
+import shutil
+import sys
+from typing import Union
+
+from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer
+from hf_bleeding_edge import AutoModelForCausalLM
+
+import utils
+import modeling.lazy_loader as lazy_loader
+import koboldai_settings
+from logger import logger, set_logger_verbosity
+
+try:
+    import breakmodel
+except ModuleNotFoundError as e:
+    # Breakmodel is only expected to work on GPU
+    if not utils.koboldai_vars.use_colab_tpu:
+        raise e
+
+from modeling.inference_models.hf_torch import HFTorchInferenceModel
+from modeling.tokenizer import GenericTokenizer
+
+# 4-bit dependencies
+import gptq
+from pathlib import Path
+from gptq.gptj import load_quant as gptj_load_quant
+from gptq.gptneox import load_quant as gptneox_load_quant
+from gptq.llama import load_quant as llama_load_quant
+from gptq.opt import load_quant as opt_load_quant
+from gptq.mpt import load_quant as mpt_load_quant
+from gptq.offload import load_quant_offload
+
+
+model_backend_name = "Huggingface GPTQ"
+
+
+def load_model_gptq_settings(path):
+    try:
+        js = json.load(open(path + "/config.json", "r"))
+    except Exception as e:
+        return False, -1, -1, False, -1
+
+    gptq_model = False
+    gptq_bits = -1
+    gptq_groupsize = -1
+    gptq_file = False
+    gptq_version = -1
+
+    gptq_legacy_files = glob.glob(os.path.join(path, "4bit*.pt")) + glob.glob(os.path.join(path, "4bit*.safetensors"))
+    if "gptq_bits" in js:
+        gptq_model = True
+        gptq_bits = js["gptq_bits"]
+        gptq_groupsize = js.get("gptq_groupsize", -1)
+        safetensors_file = os.path.join(path, "model.safetensors")
+        pt_file = os.path.join(path, "model.ckpt")
+        gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file
+        gptq_version = js.get("gptq_version", -1)
+    elif gptq_legacy_files:
+        gptq_model = True
+        gptq_bits = 4
+        gptq_file = gptq_legacy_files[0]
+        fname = Path(gptq_file).parts[-1]
+        g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
+        gptq_groupsize = int(g[0]) if g else -1
+        gptq_version = -1
+
+    return gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version
+
+
+def get_gptq_version(fpath):
+    v1_strings = ["zeros", "scales", "bias", "qweight"]
+    v2_strings = ["qzeros", "scales", "bias", "qweight"]
+    v3_strings = ["qzeros", "scales", "g_idx", "qweight"]
+
+    with open(fpath, "rb") as f:
+        data = str(f.read(1024*1024))
+
+    v0 = all([s in data for s in v1_strings]) and not "qzeros" in data
+    v1 = all([s in data for s in v2_strings])
+    v2 = all([s in data for s in v3_strings])
+
+    if v2:
+        if v0 or v1:
+            logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}")
+        return 2
+    if v1:
+        if v0 or v2:
+            logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}")
+        return 1
+    if v0:
+        if v1 or v2:
+            logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}")
+        return 0
+
+
+class model_backend(HFTorchInferenceModel):
+    def is_valid(self, model_name, model_path, menu_path):
+        gptq_model, _, _, _, _ = load_model_gptq_settings(model_path)
+        return gptq_model
+
+    def _load(self, save_model: bool, initial_load: bool) -> None:
+        # Make model path the same as the model name to make this consistent
+        # with the other loading method if it isn't a known model type. This
+        # code is not just a workaround for below, it is also used to make the
+        # behavior consistent with other loading methods - Henk717
+        # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]:
+        #     utils.koboldai_vars.custmodpth = utils.koboldai_vars.model
+
+        self.init_model_config()
+
+        self.lazy_load = False
+
+        gpulayers = breakmodel.gpu_blocks
+
+        try:
+            self.gpu_layers_list = [int(l) for l in gpulayers.split(",")]
+        except (ValueError, AttributeError):
+            self.gpu_layers_list = [utils.num_layers(self.model_config)]
+
+        tf_kwargs = {
+            "low_cpu_mem_usage": True,
+        }
+
+        # If we're using torch_lazy_loader, we need to get breakmodel config
+        # early so that it knows where to load the individual model tensors
+        logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel))
+        if (
+            self.lazy_load
+            and utils.koboldai_vars.hascuda
+            and utils.koboldai_vars.breakmodel
+            and not utils.koboldai_vars.nobreakmodel
+        ):
+            self.breakmodel_device_config(self.model_config)
+
+        if self.lazy_load:
+            # If we're using lazy loader, we need to figure out what the model's hidden layers are called
+            with lazy_loader.use_lazy_load(
+                dematerialized_modules=True, use_accelerate_init_empty_weights=True
+            ):
+                try:
+                    metamodel = AutoModelForCausalLM.from_config(self.model_config)
+                    utils.layers_module_names = utils.get_layers_module_names(metamodel)
+                    utils.module_names = list(metamodel.state_dict().keys())
+                    utils.named_buffers = list(metamodel.named_buffers(recurse=True))
+                except Exception as e:
+                    logger.warning(f"Gave up on lazy loading due to {e}")
+                    self.lazy_load = False
+
+        # Download model from Huggingface if it does not exist, otherwise load locally
+        with self._maybe_use_float16(), lazy_loader.use_lazy_load(
+            enable=self.lazy_load,
+            callback=self._get_lazy_load_callback(utils.num_layers(self.model_config))
+            if self.lazy_load
+            else None,
+            dematerialized_modules=True,
+        ):
+            if self.lazy_load:
+                # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
+                tf_kwargs.pop("low_cpu_mem_usage", None)
+
+            if self.get_local_model_path():
+                # Model is stored locally, load it.
+                self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
+                self.tokenizer = self._get_tokenizer(self.get_local_model_path())
+            else:
+                raise NotImplementedError("GPTQ Model downloading not implemented")
+
+        if not self.lazy_load:
+            utils.layers_module_names = utils.get_layers_module_names(self.model)
+            utils.module_names = list(self.model.state_dict().keys())
+            utils.named_buffers = list(self.model.named_buffers(recurse=True))
+
+        if (
+            utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default
+            and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj")
+        ):
+            utils.koboldai_vars.badwordsids = [
+                [v]
+                for k, v in self.tokenizer.get_vocab().items()
+                if any(c in str(k) for c in "[]")
+            ]
+
+        self.patch_embedding()
+
+        self.model.kai_model = self
+        utils.koboldai_vars.modeldim = self.get_hidden_size()
+
+    def _get_model(self, location: str, tf_kwargs: Dict):
+        gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location)
+
+        if gptq_version < 0:
+            gptq_version = get_gptq_version(gptq_file)
+        gptq.modelutils.set_gptq_version(gptq_version)
+
+        model_type = self.get_model_type()
+
+        logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}, groupsize {gptq_groupsize}")
+        if model_type == "gptj":
+            model = load_quant_offload(gptj_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list)
+        elif model_type == "gpt_neox":
+            model = load_quant_offload(gptneox_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list)
+        elif model_type == "llama":
+            model = load_quant_offload(llama_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list)
+        elif model_type == "opt":
+            model = load_quant_offload(opt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list)
+        elif model_type == "mpt":
+            model = load_quant_offload(mpt_load_quant, location, gptq_file, gptq_bits, gptq_groupsize, self.gpu_layers_list)
+        else:
+            raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit")
+
+        return model
+
+    def _get_tokenizer(self, location: str):
+        model_type = self.get_model_type()
+        if model_type == "llama":
+            tokenizer = LlamaTokenizer.from_pretrained(location)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(location)
+
+        return GenericTokenizer(tokenizer)
diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py
index 50d7503c..93bc08ea 100644
--- a/modeling/inference_models/generic_hf_torch/class.py
+++ b/modeling/inference_models/generic_hf_torch/class.py
@@ -25,8 +25,12 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel
 
 model_backend_name = "Huggingface"
 
-class GenericHFTorchInferenceModel(HFTorchInferenceModel):
-    def load_config(self) -> None:
+class model_backend(HFTorchInferenceModel):
+
+    def _initialize_model(self):
+        return
+
+    def _load(self, save_model: bool, initial_load: bool) -> None:
         utils.koboldai_vars.allowsp = True
 
         # Make model path the same as the model name to make this consistent
@@ -243,11 +247,6 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
                     )
                 shutil.rmtree("cache/")
 
-        if not self.lazy_load:
-            utils.layers_module_names = utils.get_layers_module_names(self.model)
-            utils.module_names = list(self.model.state_dict().keys())
-            utils.named_buffers = list(self.model.named_buffers(recurse=True))
-
         self.patch_embedding()
 
 
diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index 93e1757a..dc34636a 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -1,12 +1,8 @@
 import os, sys
 from typing import Optional
-<<<<<<< HEAD
 from hf_bleeding_edge import AutoConfig
-=======
-from transformers import AutoConfig
 import warnings
->>>>>>> ebolam/Model_Plugins
 import utils
 import json
 import koboldai_settings
 
diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py
deleted file mode 100644
index 580fa306..00000000
--- a/modeling/inference_models/hf_torch_4bit.py
+++ /dev/null
@@ -1,392 +0,0 @@
-from __future__ import annotations
-
-import os
-import glob
-import json
-import torch
-import re
-import shutil
-import sys
-from typing import Union
-
-from transformers import GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer
-from hf_bleeding_edge import AutoModelForCausalLM
-
-import utils
-import modeling.lazy_loader as lazy_loader
-import koboldai_settings
-from logger import logger, set_logger_verbosity
-
-try:
-    import breakmodel
-except ModuleNotFoundError as e:
-    # Breakmodel is only expected to work on GPU
-    if not utils.koboldai_vars.use_colab_tpu:
-        raise e
-
-from modeling.inference_models.hf_torch import HFTorchInferenceModel
-from modeling.tokenizer import GenericTokenizer
-
-# 4-bit dependencies
-import gptq
-from pathlib import Path
-from gptq.gptj import load_quant as gptj_load_quant
-from gptq.gptneox import load_quant as gptneox_load_quant
-from gptq.llama import load_quant as llama_load_quant
-from gptq.opt import load_quant as opt_load_quant
-from gptq.mpt import load_quant as mpt_load_quant
-from gptq.offload import load_quant_offload
-
-
-def prepare_4bit_load(modelpath):
-    path_4bit = os.path.join(modelpath, "model.safetensors")
-    if os.path.isfile(path_4bit):
-        return path_4bit, False
-
-    path_4bit = os.path.join(modelpath, "model.ckpt")
-    if os.path.isfile(path_4bit):
-        return path_4bit, False
-
-    # Legacy format support
-    paths_4bit = ["4bit*.safetensors", "4bit*.pt", "4bit-old.safetensors", "4bit-old.pt"]
"4bit-old.pt"] - result = False - groupsize = -1 - for p in paths_4bit: - p = os.path.join(modelpath, p) - val = [v for v in glob.glob(p) if "4bit-old" not in v] - if val: - result = val[0] - fname = Path(result).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) - groupsize = -1 - if g: - groupsize = int(g[0]) - break - - return result, groupsize - - -def load_model_gptq_settings(): - try: - js = json.loads(str(model.model_config).partition(' ')[2]) - except Exception as e: - try: - try: - js = json.load(open(utils.koboldai_vars.custmodpth + "/config.json", "r")) - except Exception as e: - js = json.load(open(utils.koboldai_vars.custmodpth.replace('/', '_') + "/config.json", "r")) - except Exception as e: - utils.koboldai_vars.gptq_model = False - return - - gptq_legacy_files = glob.glob(os.path.join(utils.koboldai_vars.custmodpth, "4bit*.pt")) + glob.glob(os.path.join(utils.koboldai_vars.custmodpth, "4bit*.safetensors")) - if "gptq_bits" in js: - utils.koboldai_vars.gptq_model = True - utils.koboldai_vars.gptq_bits = js["gptq_bits"] - utils.koboldai_vars.gptq_groupsize = js.get("gptq_groupsize", -1) - safetensors_file = os.path.join(utils.koboldai_vars.custmodpth, "model.safetensors") - pt_file = os.path.join(utils.koboldai_vars.custmodpth, "model.ckpt") - utils.koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file - utils.koboldai_vars.gptq_version = js.get("gptq_version", -1) - elif gptq_legacy_files: - utils.koboldai_vars.gptq_model = True - utils.koboldai_vars.gptq_bits = 4 - utils.koboldai_vars.gptq_file = gptq_legacy_files[0] - fname = Path(utils.koboldai_vars.gptq_file).parts[-1] - g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname) - utils.koboldai_vars.gptq_groupsize = int(g[0]) if g else -1 - utils.koboldai_vars.gptq_version = -1 - else: - utils.koboldai_vars.gptq_model = False - - -def get_gptq_version(fpath): - v1_strings = ["zeros", "scales", "bias", "qweight"] - v2_strings = ["qzeros", "scales", "bias", "qweight"] - v3_strings = ["qzeros", "scales", "g_idx", "qweight"] - - with open(fpath, "rb") as f: - data = str(f.read(1024*1024)) - - v0 = all([s in data for s in v1_strings]) and not "qzeros" in data - v1 = all([s in data for s in v2_strings]) - v2 = all([s in data for s in v3_strings]) - - if v2: - if v0 or v1: - logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}") - return 2 - if v1: - if v0 or v2: - logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}") - return 1 - if v0: - if v1 or v2: - logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}") - return 0 - - -class HFTorch4BitInferenceModel(HFTorchInferenceModel): - def _load(self, save_model: bool, initial_load: bool) -> None: - utils.koboldai_vars.allowsp = True - - # Make model path the same as the model name to make this consistent - # with the other loading method if it isn't a known model type. 
This - # code is not just a workaround for below, it is also used to make the - # behavior consistent with other loading methods - Henk717 - # if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]: - # utils.koboldai_vars.custmodpth = utils.koboldai_vars.model - - if self.model_name == "NeoCustom": - self.model_name = os.path.basename( - os.path.normpath(utils.koboldai_vars.custmodpth) - ) - utils.koboldai_vars.model = self.model_name - - self.init_model_config() - - gpulayers = utils.args.breakmodel_gpulayers - - try: - self.gpu_layers_list = [int(l) for l in gpulayers.split(",")] - except (ValueError, AttributeError): - self.gpu_layers_list = [utils.num_layers(self.model_config)] - - tf_kwargs = { - "low_cpu_mem_usage": True, - } - - # If we're using torch_lazy_loader, we need to get breakmodel config - # early so that it knows where to load the individual model tensors - if ( - self.lazy_load - and utils.koboldai_vars.hascuda - and utils.koboldai_vars.breakmodel - and not utils.koboldai_vars.nobreakmodel - ): - self.breakmodel_device_config(self.model_config) - - if self.lazy_load: - # If we're using lazy loader, we need to figure out what the model's hidden layers are called - with lazy_loader.use_lazy_load( - dematerialized_modules=True, use_accelerate_init_empty_weights=True - ): - try: - metamodel = AutoModelForCausalLM.from_config(self.model_config) - utils.layers_module_names = utils.get_layers_module_names(metamodel) - utils.module_names = list(metamodel.state_dict().keys()) - utils.named_buffers = list(metamodel.named_buffers(recurse=True)) - except Exception as e: - logger.warning(f"Gave up on lazy loading due to {e}") - self.lazy_load = False - - # Download model from Huggingface if it does not exist, otherwise load locally - with self._maybe_use_float16(), lazy_loader.use_lazy_load( - enable=self.lazy_load, - callback=self._get_lazy_load_callback(utils.num_layers(self.model_config)) - if self.lazy_load - else None, - dematerialized_modules=True, - ): - if self.lazy_load: - # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - tf_kwargs.pop("low_cpu_mem_usage", None) - - if self.get_local_model_path(): - # Model is stored locally, load it. - self.model = self._get_model(self.get_local_model_path(), tf_kwargs) - self.tokenizer = self._get_tokenizer(self.get_local_model_path()) - else: - # Model not stored locally, we need to download it. 
-
-                # _rebuild_tensor patch for casting dtype and supporting LazyTensors
-                old_rebuild_tensor = torch._utils._rebuild_tensor
-
-                def new_rebuild_tensor(
-                    storage: Union[lazy_loader.LazyTensor, torch.Storage],
-                    storage_offset,
-                    shape,
-                    stride,
-                ):
-                    if not isinstance(storage, lazy_loader.LazyTensor):
-                        dtype = storage.dtype
-                    else:
-                        dtype = storage.storage_type.dtype
-                        if not isinstance(dtype, torch.dtype):
-                            dtype = storage.storage_type(0).dtype
-                    if dtype is torch.float32 and len(shape) >= 2:
-                        utils.koboldai_vars.fp32_model = True
-                    return old_rebuild_tensor(storage, storage_offset, shape, stride)
-
-                torch._utils._rebuild_tensor = new_rebuild_tensor
-                self.model = self._get_model(self.model_name, tf_kwargs)
-                self.tokenizer = self._get_tokenizer(self.model_name)
-                torch._utils._rebuild_tensor = old_rebuild_tensor
-
-                if save_model:
-                    self.tokenizer.save_pretrained(
-                        self.get_local_model_path(ignore_existance=True)
-                    )
-
-                    if utils.koboldai_vars.fp32_model and not breakmodel.disk_blocks:
-                        # Use save_pretrained to convert fp32 models to fp16,
-                        # unless we are using disk cache because save_pretrained
-                        # is not supported in that case
-                        self.model = self.model.half()
-                        self.model.save_pretrained(
-                            self.get_local_model_path(ignore_existance=True),
-                            max_shard_size="500MiB",
-                        )
-
-                    else:
-                        # For fp16 models, we can just copy the model files directly
-                        import transformers.configuration_utils
-                        import transformers.modeling_utils
-                        import transformers.file_utils
-                        import huggingface_hub
-
-                        # Save the config.json
-                        shutil.move(
-                            os.path.realpath(
-                                huggingface_hub.hf_hub_download(
-                                    self.model_name,
-                                    transformers.configuration_utils.CONFIG_NAME,
-                                    revision=utils.koboldai_vars.revision,
-                                    cache_dir="cache",
-                                    local_files_only=True,
-                                    legacy_cache_layout=False,
-                                )
-                            ),
-                            os.path.join(
-                                self.get_local_model_path(ignore_existance=True),
-                                transformers.configuration_utils.CONFIG_NAME,
-                            ),
-                        )
-
-                        if utils.num_shards is None:
-                            # Save the pytorch_model.bin or model.safetensors of an unsharded model
-                            any_success = False
-                            possible_checkpoint_names = [
-                                transformers.modeling_utils.WEIGHTS_NAME,
-                                "model.safetensors",
-                            ]
-
-                            for possible_checkpoint_name in possible_checkpoint_names:
-                                try:
-                                    shutil.move(
-                                        os.path.realpath(
-                                            huggingface_hub.hf_hub_download(
-                                                self.model_name,
-                                                possible_checkpoint_name,
-                                                revision=utils.koboldai_vars.revision,
-                                                cache_dir="cache",
-                                                local_files_only=True,
-                                                legacy_cache_layout=False,
-                                            )
-                                        ),
-                                        os.path.join(
-                                            self.get_local_model_path(
-                                                ignore_existance=True
-                                            ),
-                                            possible_checkpoint_name,
-                                        ),
-                                    )
-                                    any_success = True
-                                except Exception:
-                                    pass
-
-                            if not any_success:
-                                raise RuntimeError(f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revisison}'")
-                        else:
-                            # Handle saving sharded models
-
-                            with open(utils.from_pretrained_index_filename) as f:
-                                map_data = json.load(f)
-                            filenames = set(map_data["weight_map"].values())
-                            # Save the pytorch_model.bin.index.json of a sharded model
-                            shutil.move(
-                                os.path.realpath(utils.from_pretrained_index_filename),
-                                os.path.join(
-                                    self.get_local_model_path(ignore_existance=True),
-                                    transformers.modeling_utils.WEIGHTS_INDEX_NAME,
-                                ),
-                            )
-                            # Then save the pytorch_model-#####-of-#####.bin files
-                            for filename in filenames:
-                                shutil.move(
-                                    os.path.realpath(
-                                        huggingface_hub.hf_hub_download(
-                                            self.model_name,
-                                            filename,
-                                            revision=utils.koboldai_vars.revision,
-                                            cache_dir="cache",
-                                            local_files_only=True,
-                                            legacy_cache_layout=False,
-                                        )
-                                    ),
-                                    os.path.join(
-                                        self.get_local_model_path(
-                                            ignore_existance=True
-                                        ),
-                                        filename,
-                                    ),
-                                )
-                    shutil.rmtree("cache/")
-
-        if not self.lazy_load:
-            utils.layers_module_names = utils.get_layers_module_names(self.model)
-            utils.module_names = list(self.model.state_dict().keys())
-            utils.named_buffers = list(self.model.named_buffers(recurse=True))
-
-        if (
-            utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default
-            and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj")
-        ):
-            utils.koboldai_vars.badwordsids = [
-                [v]
-                for k, v in self.tokenizer.get_vocab().items()
-                if any(c in str(k) for c in "[]")
-            ]
-
-        self.patch_embedding()
-
-        self.model.kai_model = self
-        utils.koboldai_vars.modeldim = self.get_hidden_size()
-
-    def _get_model(self, location: str, tf_kwargs: Dict):
-        if not utils.koboldai_vars.custmodpth:
-            pass
-        groupsize = utils.koboldai_vars.gptq_groupsize
-
-        path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth)
-
-        if utils.koboldai_vars.gptq_version < 0:
-            utils.koboldai_vars.gptq_version = get_gptq_version(path_4bit)
-        gptq.modelutils.set_gptq_version(utils.koboldai_vars.gptq_version)
-
-        if legacy_groupsize is not False:
-            groupsize = legacy_groupsize
-
-        logger.info(f"Using GPTQ file: {path_4bit}, {utils.koboldai_vars.gptq_bits}-bit model, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}")
-        if utils.koboldai_vars.model_type == "gptj":
-            model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list)
-        elif utils.koboldai_vars.model_type == "gpt_neox":
-            model = load_quant_offload(gptneox_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list)
-        elif utils.koboldai_vars.model_type == "llama":
-            model = load_quant_offload(llama_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list)
-        elif utils.koboldai_vars.model_type == "opt":
-            model = load_quant_offload(opt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list)
-        elif utils.koboldai_vars.model_type == "mpt":
-            model = load_quant_offload(mpt_load_quant, utils.koboldai_vars.custmodpth, path_4bit, utils.koboldai_vars.gptq_bits, groupsize, self.gpu_layers_list)
-        else:
-            raise RuntimeError(f"4-bit load failed. Model type {utils.koboldai_vars.model_type} not supported in 4-bit")
-
-        return model
-
-    def _get_tokenizer(self, location: str):
-        if utils.koboldai_vars.model_type == "llama":
-            tokenizer = LlamaTokenizer.from_pretrained(utils.koboldai_vars.custmodpth)
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(utils.koboldai_vars.custmodpth)
-
-        return GenericTokenizer(tokenizer)