Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-06-05 21:59:24 +02:00)
Merge branch 'united' of https://github.com/henk717/KoboldAI into fixing-time
@@ -6,7 +6,13 @@ import torch
 import shutil
 from typing import Union
 
-from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig
+from transformers import GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig
+try:
+    from hf_bleeding_edge import AutoModelForCausalLM
+except ImportError:
+    from transformers import AutoModelForCausalLM
 
 from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
 
 import utils
 import modeling.lazy_loader as lazy_loader
@@ -25,6 +31,19 @@ class model_backend(HFTorchInferenceModel):
         super().__init__()
         self.use_4_bit = False
 
+    def is_valid(self, model_name, model_path, menu_path):
+        base_is_valid = super().is_valid(model_name, model_path, menu_path)
+        path = False
+        gen_path = "models/{}".format(model_name.replace('/', '_'))
+        if model_path is not None and os.path.exists(model_path):
+            path = model_path
+        elif os.path.exists(gen_path):
+            path = gen_path
+
+        fnames = [WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME]
+
+        return base_is_valid and any(os.path.exists(os.path.join(path, fname)) for fname in fnames)
+
     def _initialize_model(self):
         return
 
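For reference, the transformers.utils constants used by the new is_valid check resolve to the standard checkpoint filenames (for example WEIGHTS_NAME is "pytorch_model.bin" and SAFE_WEIGHTS_NAME is "model.safetensors"), so the method only asks whether any known weight file exists in the local model folder. A minimal standalone sketch of the same idea; the directory name is purely illustrative:

    import os
    from transformers.utils import WEIGHTS_NAME, SAFE_WEIGHTS_NAME

    # WEIGHTS_NAME == "pytorch_model.bin", SAFE_WEIGHTS_NAME == "model.safetensors"
    model_dir = "models/EleutherAI_gpt-neo-2.7B"  # hypothetical local download path
    has_weights = any(
        os.path.exists(os.path.join(model_dir, fname))
        for fname in (WEIGHTS_NAME, SAFE_WEIGHTS_NAME)
    )
    print("usable checkpoint found:", has_weights)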
@@ -39,23 +58,24 @@ class model_backend(HFTorchInferenceModel):
             else:
                 temp = {}
             requested_parameters.append({
-                "uitype": "toggle",
-                "unit": "bool",
-                "label": "Use 4-bit",
-                "id": "use_4_bit",
-                "default": temp['use_4_bit'] if 'use_4_bit' in temp else False,
-                "tooltip": "Whether or not to use BnB's 4-bit mode",
+                "uitype": "dropdown",
+                "unit": "text",
+                "label": "Quantization",
+                "id": "quantization",
+                "default": temp['quantization'] if 'quantization' in temp else '4bit' if dependency_exists else '16-bit',
+                "tooltip": "Whether or not to use BnB's 4-bit or 8-bit mode",
                 "menu_path": "Layers",
+                "children": [{'text': '4-bit', 'value': '4bit'}, {'text': '8-bit', 'value': '8bit'}, {'text': '16-bit', 'value':'16-bit'}],
                 "extra_classes": "",
                 "refresh_model_inputs": False
             })
         else:
-            logger.warning("Bitsandbytes is not installed, you can not use Huggingface models in 4-bit")
+            logger.warning("Bitsandbytes is not installed, you can not use Quantization for Huggingface models")
         return requested_parameters
 
     def set_input_parameters(self, parameters):
         super().set_input_parameters(parameters)
-        self.use_4_bit = parameters['use_4_bit'] if 'use_4_bit' in parameters else False
+        self.quantization = parameters['quantization'] if 'quantization' in parameters else False
 
     def _load(self, save_model: bool, initial_load: bool) -> None:
         utils.koboldai_vars.allowsp = True
@@ -85,13 +105,22 @@ class model_backend(HFTorchInferenceModel):
             "low_cpu_mem_usage": True,
         }
 
-        if self.use_4_bit or utils.koboldai_vars.colab_arg:
+        if self.quantization == "8bit":
             tf_kwargs.update({
                 "quantization_config":BitsAndBytesConfig(
+                    load_in_8bit=True,
+                    llm_int8_enable_fp32_cpu_offload=True
+                ),
+            })
+
+        if self.quantization == "4bit" or utils.koboldai_vars.colab_arg:
+            tf_kwargs.update({
+                "quantization_config":BitsAndBytesConfig(
                     load_in_4bit=True,
                     bnb_4bit_compute_dtype=torch.float16,
                     bnb_4bit_use_double_quant=True,
-                    bnb_4bit_quant_type='nf4'
+                    bnb_4bit_quant_type='nf4',
+                    llm_int8_enable_fp32_cpu_offload=True
                 ),
             })
 
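The tf_kwargs assembled here are forwarded to the Hugging Face from_pretrained call, so a BitsAndBytesConfig built this way ends up as its quantization_config argument. A minimal standalone sketch of the equivalent 4-bit load, assuming bitsandbytes and a CUDA GPU are available; the model name is purely illustrative:

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    # Same 4-bit settings as the hunk above; "facebook/opt-1.3b" is just an example checkpoint.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-1.3b",
        quantization_config=quant_config,
        low_cpu_mem_usage=True,
        device_map="auto",  # let accelerate place layers on the available GPU(s)
    )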
@@ -99,10 +128,16 @@ class model_backend(HFTorchInferenceModel):
             # We must disable low_cpu_mem_usage and if using a GPT-2 model
             # because GPT-2 is not compatible with this feature yet.
             tf_kwargs.pop("low_cpu_mem_usage", None)
 
+            tf_kwargs.pop("quantization_config", None)
+
             # Also, lazy loader doesn't support GPT-2 models
             self.lazy_load = False
 
+        if self.model_type == "llama":
+            tf_kwargs.update({
+                "pretraining_tp": 1 # Workaround recommended by HF to fix their mistake on the config.json tuners adopted
+            })
+
         logger.debug(
             "lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(
                 self.lazy_load,
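On the llama branch above: pretraining_tp is an attribute of the Hugging Face LlamaConfig, and values greater than 1 make the model emulate the tensor-parallel linear splits used during pretraining, which is slower; forcing it to 1 keeps the ordinary linear layers. A minimal sketch of overriding it at load time; the checkpoint name is purely illustrative:

    from transformers import AutoConfig, AutoModelForCausalLM

    # "meta-llama/Llama-2-7b-hf" is only an example checkpoint name.
    config = AutoConfig.from_pretrained("meta-llama/Llama-2-7b-hf")
    config.pretraining_tp = 1  # use the plain linear forward instead of the split tensor-parallel path
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", config=config)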
@@ -299,7 +334,7 @@ class model_backend(HFTorchInferenceModel):
                     "disk_layers": self.disk_layers
                     if "disk_layers" in vars(self)
                     else 0,
-                    "use_4_bit": self.use_4_bit,
+                    "quantization": self.quantization,
                 },
                 f,
                 indent="",