Mirror of https://github.com/KoboldAI/KoboldAI-Client.git, synced 2025-06-05 21:59:24 +02:00
Merge upstream changes, fix conflict, adapt backends to changes
@@ -32,6 +32,7 @@ from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
 from transformers import LlamaTokenizer
 from exllama.generator import ExLlamaGenerator
 
+model_backend_type = "GPTQ"
 model_backend_name = "ExLlama"
 
 # When set to true, messages will appear in the console if samplers are not
@@ -18,13 +18,6 @@ import modeling.lazy_loader as lazy_loader
 import koboldai_settings
 from logger import logger, set_logger_verbosity
 
-try:
-    import breakmodel
-except ModuleNotFoundError as e:
-    # Breakmodel is only expected to work on GPU
-    if not utils.koboldai_vars.use_colab_tpu:
-        raise e
-
 from modeling.inference_models.hf_torch import HFTorchInferenceModel
 from modeling.tokenizer import GenericTokenizer
 
@@ -47,6 +40,7 @@ except ImportError:
 autogptq_support = False
 
 
+model_backend_type = "GPTQ"
 model_backend_name = "Huggingface GPTQ"
 
 
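Both hunks above add a module-level model_backend_type next to the existing model_backend_name, so the ExLlama and Huggingface GPTQ backends now advertise the same backend type ("GPTQ") under different display names. Below is a minimal sketch of how such module attributes could be consumed by a loader that imports each backend module and reads the two attributes; the registry function and module paths are illustrative assumptions, not KoboldAI's actual loader.

import importlib
from collections import defaultdict

# Illustrative backend module paths; the real discovery mechanism may differ.
BACKEND_MODULES = [
    "modeling.inference_models.exllama.class",
    "modeling.inference_models.gptq_hf_torch.class",
]

def group_backends_by_type(module_names):
    """Group backend display names under the backend type each module declares."""
    groups = defaultdict(list)
    for name in module_names:
        module = importlib.import_module(name)
        backend_type = getattr(module, "model_backend_type", "Unknown")
        backend_name = getattr(module, "model_backend_name", name)
        groups[backend_type].append(backend_name)
    return dict(groups)

# Expected grouping for the two modules touched by this commit:
# {"GPTQ": ["ExLlama", "Huggingface GPTQ"]}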
@@ -112,7 +106,7 @@ def get_gptq_version(fpath):
 class model_backend(HFTorchInferenceModel):
     def is_valid(self, model_name, model_path, menu_path):
         gptq_model, _, _, _, _ = load_model_gptq_settings(model_path)
-        return gptq_model
+        return bool(gptq_model)
 
     def _load(self, save_model: bool, initial_load: bool) -> None:
         # Make model path the same as the model name to make this consistent
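The is_valid change coerces the detection result to a strict boolean rather than returning whatever load_model_gptq_settings found. A tiny illustration of the difference, assuming (not confirmed by this diff) that the first tuple element is a checkpoint path when GPTQ files are detected and a falsy value otherwise:

# Hypothetical detection results; only their truthiness matters to is_valid().
found = "model-4bit-128g.safetensors"  # assumed shape of a positive detection
print(bool(found))     # True  -> this backend is offered for the model
missing = None                          # assumed shape of a negative detection
print(bool(missing))   # False -> this backend is skipped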
@@ -126,7 +120,7 @@ class model_backend(HFTorchInferenceModel):
 
         self.lazy_load = False
 
-        gpulayers = breakmodel.gpu_blocks
+        gpulayers = self.breakmodel_config.gpu_blocks
 
         try:
             self.gpu_layers_list = [int(l) for l in gpulayers.split(",")]
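The only functional change here is reading the GPU split from self.breakmodel_config.gpu_blocks, an attribute on the backend instance, instead of the module-level breakmodel.gpu_blocks (presumably the per-instance configuration introduced by the upstream merge). The surrounding context parses that value with split(","), i.e. it expects a comma-separated count of layers per GPU. A standalone sketch of that parsing step with a defensive fallback; the function name and the fallback behaviour are illustrative, not taken from this file:

def parse_gpu_layers(gpulayers, total_layers):
    """Turn a value like "20,12" into [20, 12]; fall back to one-GPU-takes-all."""
    try:
        return [int(layer_count) for layer_count in str(gpulayers).split(",")]
    except ValueError:
        # Malformed input: assign every layer to the first GPU as a fallback.
        return [total_layers]

print(parse_gpu_layers("20,12", 32))  # [20, 12]
print(parse_gpu_layers("oops", 32))   # [32]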
@@ -149,42 +143,28 @@ class model_backend(HFTorchInferenceModel):
             self.breakmodel_device_config(self.model_config)
 
         if self.lazy_load:
+            # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
+            tf_kwargs.pop("low_cpu_mem_usage", None)
+
             # If we're using lazy loader, we need to figure out what the model's hidden layers are called
-            with lazy_loader.use_lazy_load(
-                dematerialized_modules=True, use_accelerate_init_empty_weights=True
-            ):
+            with lazy_loader.use_lazy_load(dematerialized_modules=True):
                 try:
                     metamodel = AutoModelForCausalLM.from_config(self.model_config)
                     utils.layers_module_names = utils.get_layers_module_names(metamodel)
                     utils.module_names = list(metamodel.state_dict().keys())
                     utils.named_buffers = list(metamodel.named_buffers(recurse=True))
                 except Exception as e:
+                    if utils.args.panic:
+                        raise e
                     logger.warning(f"Gave up on lazy loading due to {e}")
                     self.lazy_load = False
 
-        # Download model from Huggingface if it does not exist, otherwise load locally
-        with self._maybe_use_float16(), lazy_loader.use_lazy_load(
-            enable=self.lazy_load,
-            callback=self._get_lazy_load_callback(utils.num_layers(self.model_config))
-            if self.lazy_load
-            else None,
-            dematerialized_modules=True,
-        ):
-            if self.lazy_load:
-                # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
-                tf_kwargs.pop("low_cpu_mem_usage", None)
-
-            if self.get_local_model_path():
-                # Model is stored locally, load it.
-                self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
-                self.tokenizer = self._get_tokenizer(self.get_local_model_path())
-            else:
-                raise NotImplementedError("GPTQ Model downloading not implemented")
-
-        if not self.lazy_load:
-            utils.layers_module_names = utils.get_layers_module_names(self.model)
-            utils.module_names = list(self.model.state_dict().keys())
-            utils.named_buffers = list(self.model.named_buffers(recurse=True))
+        if self.get_local_model_path():
+            # Model is stored locally, load it.
+            self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
+            self.tokenizer = self._get_tokenizer(self.get_local_model_path())
+        else:
+            raise NotImplementedError("GPTQ Model downloading not implemented")
 
         if (
             utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default
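The restructured block keeps the merged upstream flow: when lazy loading is enabled, the backend first builds a dematerialized "meta" copy of the model from its config purely to record layer and module names, then loads the real weights from the local model path. A self-contained sketch of that probing idea, using accelerate's init_empty_weights in place of KoboldAI's lazy_loader.use_lazy_load; the helper name and the use of accelerate here are assumptions for illustration, not the project's implementation.

from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

def probe_module_names(model_path: str):
    """Enumerate a model's module names without allocating real weight tensors."""
    config = AutoConfig.from_pretrained(model_path)
    with init_empty_weights():
        # Parameters are created on the "meta" device, so this stays cheap even
        # for large checkpoints; only the module structure is materialized.
        metamodel = AutoModelForCausalLM.from_config(config)
    module_names = list(metamodel.state_dict().keys())
    named_buffers = list(metamodel.named_buffers(recurse=True))
    return module_names, named_buffers

# These name lists are what a lazy loader can later use to route individual
# checkpoint tensors to the right device as they are read from disk.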