From 9aa6c5fbbfcb9a2f22f38fc9baa07e5baa033361 Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Wed, 19 Jul 2023 06:56:09 +0200
Subject: [PATCH] Merge upstream changes, fix conflict, adapt backends to changes

---
 modeling/inference_models/exllama/class.py   |  1 +
 .../inference_models/gptq_hf_torch/class.py  | 50 ++++++-------------
 2 files changed, 16 insertions(+), 35 deletions(-)

diff --git a/modeling/inference_models/exllama/class.py b/modeling/inference_models/exllama/class.py
index 1caa2afd..21eba58e 100644
--- a/modeling/inference_models/exllama/class.py
+++ b/modeling/inference_models/exllama/class.py
@@ -32,6 +32,7 @@ from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
 from transformers import LlamaTokenizer
 from exllama.generator import ExLlamaGenerator
 
+model_backend_type = "GPTQ"
 model_backend_name = "ExLlama"
 
 # When set to true, messages will appear in the console if samplers are not
diff --git a/modeling/inference_models/gptq_hf_torch/class.py b/modeling/inference_models/gptq_hf_torch/class.py
index d07aef23..16d3db91 100644
--- a/modeling/inference_models/gptq_hf_torch/class.py
+++ b/modeling/inference_models/gptq_hf_torch/class.py
@@ -18,13 +18,6 @@ import modeling.lazy_loader as lazy_loader
 import koboldai_settings
 from logger import logger, set_logger_verbosity
 
-try:
-    import breakmodel
-except ModuleNotFoundError as e:
-    # Breakmodel is only expected to work on GPU
-    if not utils.koboldai_vars.use_colab_tpu:
-        raise e
-
 from modeling.inference_models.hf_torch import HFTorchInferenceModel
 from modeling.tokenizer import GenericTokenizer
 
@@ -47,6 +40,7 @@ except ImportError:
     autogptq_support = False
 
 
+model_backend_type = "GPTQ"
 model_backend_name = "Huggingface GPTQ"
 
 
@@ -112,7 +106,7 @@ def get_gptq_version(fpath):
 class model_backend(HFTorchInferenceModel):
     def is_valid(self, model_name, model_path, menu_path):
         gptq_model, _, _, _, _ = load_model_gptq_settings(model_path)
-        return gptq_model
+        return bool(gptq_model)
 
     def _load(self, save_model: bool, initial_load: bool) -> None:
         # Make model path the same as the model name to make this consistent
@@ -126,7 +120,7 @@ class model_backend(HFTorchInferenceModel):
 
         self.lazy_load = False
 
-        gpulayers = breakmodel.gpu_blocks
+        gpulayers = self.breakmodel_config.gpu_blocks
 
         try:
             self.gpu_layers_list = [int(l) for l in gpulayers.split(",")]
@@ -149,42 +143,28 @@ class model_backend(HFTorchInferenceModel):
         self.breakmodel_device_config(self.model_config)
 
         if self.lazy_load:
+            # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
+            tf_kwargs.pop("low_cpu_mem_usage", None)
+
             # If we're using lazy loader, we need to figure out what the model's hidden layers are called
-            with lazy_loader.use_lazy_load(
-                dematerialized_modules=True, use_accelerate_init_empty_weights=True
-            ):
+            with lazy_loader.use_lazy_load(dematerialized_modules=True):
                 try:
                     metamodel = AutoModelForCausalLM.from_config(self.model_config)
                     utils.layers_module_names = utils.get_layers_module_names(metamodel)
                     utils.module_names = list(metamodel.state_dict().keys())
                     utils.named_buffers = list(metamodel.named_buffers(recurse=True))
                 except Exception as e:
+                    if utils.args.panic:
+                        raise e
                     logger.warning(f"Gave up on lazy loading due to {e}")
                     self.lazy_load = False
 
-        # Download model from Huggingface if it does not exist, otherwise load locally
-        with self._maybe_use_float16(), lazy_loader.use_lazy_load(
-            enable=self.lazy_load,
-            callback=self._get_lazy_load_callback(utils.num_layers(self.model_config))
-            if self.lazy_load
-            else None,
-            dematerialized_modules=True,
-        ):
-            if self.lazy_load:
-                # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
-                tf_kwargs.pop("low_cpu_mem_usage", None)
-
-            if self.get_local_model_path():
-                # Model is stored locally, load it.
-                self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
-                self.tokenizer = self._get_tokenizer(self.get_local_model_path())
-            else:
-                raise NotImplementedError("GPTQ Model downloading not implemented")
-
-        if not self.lazy_load:
-            utils.layers_module_names = utils.get_layers_module_names(self.model)
-            utils.module_names = list(self.model.state_dict().keys())
-            utils.named_buffers = list(self.model.named_buffers(recurse=True))
+        if self.get_local_model_path():
+            # Model is stored locally, load it.
+            self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
+            self.tokenizer = self._get_tokenizer(self.get_local_model_path())
+        else:
+            raise NotImplementedError("GPTQ Model downloading not implemented")
 
         if (
             utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default
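
Pieced together from the hunks above, the lazy-load path of model_backend._load()
in modeling/inference_models/gptq_hf_torch/class.py comes out of this patch
looking roughly like the sketch below. It is a reconstruction from the added and
context lines only, dedented from the real method body for readability, so the
surrounding code in the actual file may differ; comments marked "note:" are
interpretation rather than lines from the commit.

    if self.lazy_load:
        # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
        tf_kwargs.pop("low_cpu_mem_usage", None)

        # Probe a dematerialized metamodel to learn the hidden layers' module names
        with lazy_loader.use_lazy_load(dematerialized_modules=True):
            try:
                metamodel = AutoModelForCausalLM.from_config(self.model_config)
                utils.layers_module_names = utils.get_layers_module_names(metamodel)
                utils.module_names = list(metamodel.state_dict().keys())
                utils.named_buffers = list(metamodel.named_buffers(recurse=True))
            except Exception as e:
                # note: with the panic option set, probing failures become fatal
                # instead of silently falling back to non-lazy loading
                if utils.args.panic:
                    raise e
                logger.warning(f"Gave up on lazy loading due to {e}")
                self.lazy_load = False

    # note: the old _maybe_use_float16()/use_lazy_load() wrapper around the
    # actual load is gone; the model now loads directly, and only from disk
    if self.get_local_model_path():
        self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
        self.tokenizer = self._get_tokenizer(self.get_local_model_path())
    else:
        raise NotImplementedError("GPTQ Model downloading not implemented")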