Merge pull request #367 from 0cc4m/4bit-plugin

GPTQ module
This commit is contained in:
henk717
2023-07-23 22:32:20 +02:00
committed by GitHub
10 changed files with 344 additions and 8 deletions

50
README_GPTQ.md Normal file
View File

@@ -0,0 +1,50 @@
### Install/Use Guide
(This guide is for both Linux and Windows and assumes the user has git installed and a basic grasp of command-line use.)
#### Installation
In the command prompt/command line navigate to where you want the KoboldAI subfolder to be created.
Note: do not run your command prompt as administrator/with elevated privileges; reports suggest this leads to problems.
`git clone https://github.com/0cc4m/KoboldAI -b latestgptq --recurse-submodules`
`cd KoboldAI`
Next step: install the requirements. (On Windows, it doesn't matter whether you pick subfolder mode or the B: option; choose either.)
* [if on Windows]
```
install_requirements.bat
```
* if it closes the window when it finishes, reopen a command prompt and navigate back to your KoboldAI directory.
* [if on Linux with Nvidia]
```
./install_requirements.sh
```
* [if on Linux with AMD]
```
./install_requirements.sh rocm
./commandline-rocm.sh
pip install git+https://github.com/0cc4m/GPTQ-for-LLaMa@c884b421a233f9603d8224c9b22c2d83dd2c1fc4
```
* If you get an error about a missing hip/hip_runtime_xxx.h, you don't have the proper ROCm and HIP packages installed.
* If you get a "CUDA_HOME envar is not set" error, run this in the env:
`pip3 install torch --index-url https://download.pytorch.org/whl/rocm5.4.2 --force-reinstall`
#### Setting up models
If you haven't already done so, create a model folder with the same name as your model (or whatever you want to name the folder)
Put your 4bit quantized .pt or .safetensors in that folder with all associated .json files and tokenizer.model (.json files and tokenizer.model should be from the Huggingface model folder of the same model type).
Then move your model folder to KoboldAI/models, and rename the .pt or .safetensors file in your model folder to `4bit.pt` or `4bit.safetensors` for non-groupsize models, or to `4bit-<groupsize>g.pt` or `4bit-<groupsize>g.safetensors` for a groupsize model (Example: `4bit-128g.safetensors`)
So - your .pt's model folder should look like this: "4bit.pt, config.json, generation_config.json, pytorch_model.bin.index.json, special_tokens_map.json, tokenizer.model, tokenizer_config.json" Note: the 4bit.pt file can be in the same folder as the regular HF .bin files it was quantized from, it'll load the quantized model.
#### Running KoboldAI and loading 4bit models
If you haven't done so already, exit the command prompt/leave KAI's conda env. (Close the commandline window on Windows, run `exit` on Linux)
Run `play.bat` [windows], `play.sh` [linux Nvidia], or `play-rocm.sh` [linux AMD]
Switch to UI2, then load your model.

View File

@@ -46,5 +46,11 @@ dependencies:
- ftfy
- pydub
- diffusers
- git+https://github.com/0cc4m/hf_bleeding_edge/
- --find-links=https://0cc4m.github.io/GPTQ-for-LLaMa/gptq-whl-links.html
- gptq_koboldai==0.0.6
- einops
- peft==0.3.0
- scipy
- --find-links=https://0cc4m.github.io/exllama/exllama-whl-links.html
- exllama==0.0.6

View File

@@ -41,4 +41,6 @@ dependencies:
- ftfy
- pydub
- diffusers
- git+https://github.com/0cc4m/hf_bleeding_edge/
- einops
- peft==0.3.0

View File

@@ -6,7 +6,13 @@ import torch
import shutil
from typing import Union
from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig
from transformers import GPTNeoForCausalLM, GPT2LMHeadModel, BitsAndBytesConfig
try:
from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
from transformers import AutoModelForCausalLM
from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, TF2_WEIGHTS_NAME, TF2_WEIGHTS_INDEX_NAME, TF_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME
import utils
import modeling.lazy_loader as lazy_loader
@@ -21,7 +27,19 @@ model_backend_name = "Huggingface"
model_backend_type = "Huggingface" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)
class model_backend(HFTorchInferenceModel):
def is_valid(self, model_name, model_path, menu_path):
    """Report whether this backend can load the given model.

    The model is accepted when the parent class accepts it AND a local
    model directory contains at least one of the standard Hugging Face
    weight or weight-index files.

    Args:
        model_name: Model identifier; ``/`` is replaced by ``_`` to build
            the default ``models/<name>`` directory.
        model_path: Explicit model directory, or None.
        menu_path: Forwarded unchanged to the parent implementation.

    Returns:
        True when both conditions above hold, otherwise False.
    """
    base_is_valid = super().is_valid(model_name, model_path, menu_path)
    gen_path = "models/{}".format(model_name.replace('/', '_'))

    if model_path is not None and os.path.exists(model_path):
        path = model_path
    elif os.path.exists(gen_path):
        path = gen_path
    else:
        # No local directory to probe. The previous code left `path` as
        # False here and then crashed in os.path.join(False, ...) with a
        # TypeError whenever the parent check had succeeded.
        return False

    fnames = [
        WEIGHTS_NAME,
        WEIGHTS_INDEX_NAME,
        TF2_WEIGHTS_NAME,
        TF2_WEIGHTS_INDEX_NAME,
        TF_WEIGHTS_NAME,
        FLAX_WEIGHTS_NAME,
        FLAX_WEIGHTS_INDEX_NAME,
        SAFE_WEIGHTS_NAME,
        SAFE_WEIGHTS_INDEX_NAME,
    ]
    return base_is_valid and any(
        os.path.exists(os.path.join(path, fname)) for fname in fnames
    )
def _initialize_model(self):
return

View File

@@ -0,0 +1,240 @@
from __future__ import annotations
import os
import glob
import json
import torch
import re
import shutil
import sys
from typing import Union
import utils
import modeling.lazy_loader as lazy_loader
import koboldai_settings
from logger import logger, set_logger_verbosity
from modeling.inference_models.hf_torch import HFTorchInferenceModel
from modeling.tokenizer import GenericTokenizer
from pathlib import Path
model_backend_type = "GPTQ"
model_backend_name = "Huggingface GPTQ"
def load_model_gptq_settings(path):
    """Detect whether the model directory ``path`` holds a GPTQ model.

    Two layouts are recognised:

    * ``config.json`` contains a ``gptq_bits`` key. The weights are then
      expected in ``model.safetensors`` (preferred when it exists) or
      ``model.ckpt``.
    * Legacy layout: a ``*4bit*.pt`` or ``*4bit*.safetensors`` file sits in
      the directory. The group size is parsed from a ``4bit-<N>g`` fragment
      of the file name.

    Args:
        path: Model directory. ``None`` or an unreadable/invalid
            ``config.json`` yields the "not GPTQ" result.

    Returns:
        A tuple ``(gptq_model, gptq_bits, gptq_groupsize, gptq_file,
        gptq_version)``. When nothing is detected this is
        ``(False, -1, -1, False, -1)``; ``-1`` marks an unknown group size
        or version.
    """
    try:
        # Context manager so the handle is closed even if parsing fails.
        with open(os.path.join(path, "config.json"), "r") as config_file:
            js = json.load(config_file)
    except (OSError, ValueError, TypeError):
        # Missing/unreadable file, malformed JSON, or a non-path `path`
        # (e.g. None): this is a probe, so report "not a GPTQ model".
        return False, -1, -1, False, -1

    gptq_model = False
    gptq_bits = -1
    gptq_groupsize = -1
    gptq_file = False
    gptq_version = -1

    # glob returns entries in arbitrary order; sort so the same file is
    # chosen on every run when several legacy candidates exist.
    gptq_legacy_files = sorted(
        glob.glob(os.path.join(path, "*4bit*.pt"))
        + glob.glob(os.path.join(path, "*4bit*.safetensors"))
    )

    if "gptq_bits" in js:
        gptq_model = True
        gptq_bits = js["gptq_bits"]
        gptq_groupsize = js.get("gptq_groupsize", -1)
        safetensors_file = os.path.join(path, "model.safetensors")
        pt_file = os.path.join(path, "model.ckpt")
        gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file
        gptq_version = js.get("gptq_version", -1)
    elif gptq_legacy_files:
        gptq_model = True
        gptq_bits = 4
        gptq_file = gptq_legacy_files[0]
        fname = Path(gptq_file).name
        g = re.findall(r"(?:4bit)(?:-)(\d+)(?:g-?)", fname)
        gptq_groupsize = int(g[0]) if g else -1
        gptq_version = -1

    return gptq_model, gptq_bits, gptq_groupsize, gptq_file, gptq_version
def get_gptq_version(fpath):
    """Guess the GPTQ format version of a checkpoint from its tensor names.

    The first 1 MiB of the file is scanned for the tensor-name fragments
    that distinguish the formats:

    * version 2: ``qzeros``, ``scales``, ``g_idx``, ``qweight``
    * version 1: ``qzeros``, ``scales``, ``bias``, ``qweight``
    * version 0: ``zeros`` (but not ``qzeros``), ``scales``, ``bias``,
      ``qweight``

    Args:
        fpath: Path of the checkpoint file.

    Returns:
        ``(version, has_bias)``. ``has_bias`` can only be True for
        version 2, where it reports whether ``bias`` was also found.

    Raises:
        ValueError: None of the known layouts matched. Previously the
            function fell off the end and returned None, which made the
            caller fail with an opaque tuple-unpacking TypeError.
    """
    with open(fpath, "rb") as f:
        # str() of the bytes is enough for plain substring checks.
        data = str(f.read(1024 * 1024))

    def has_all(markers):
        return all(marker in data for marker in markers)

    # The checks are mutually exclusive in practice: version 0 requires the
    # absence of "qzeros" while versions 1 and 2 require its presence, and
    # version 2 is tested first. The old cross-version warnings could
    # therefore never fire and have been dropped.
    if has_all(("qzeros", "scales", "g_idx", "qweight")):
        return 2, "bias" in data
    if has_all(("qzeros", "scales", "bias", "qweight")):
        return 1, False
    # "zeros" is a substring of "qzeros", so qzeros must be excluded explicitly.
    if "qzeros" not in data and has_all(("zeros", "scales", "bias", "qweight")):
        return 0, False

    raise ValueError(
        f"Could not identify the GPTQ version of {fpath}: "
        "no known tensor names found in the first 1 MiB"
    )
class model_backend(HFTorchInferenceModel):
def is_valid(self, model_name, model_path, menu_path):
    """Return True when ``model_path`` is detected as a GPTQ model.

    Only ``model_path`` is inspected; ``model_name`` and ``menu_path`` are
    accepted for interface compatibility and are not used.
    """
    is_gptq, _bits, _groupsize, _file, _version = load_model_gptq_settings(model_path)
    if is_gptq:
        return True
    return False
def _load(self, save_model: bool, initial_load: bool) -> None:
# Loads a locally stored GPTQ model and its tokenizer into self.model /
# self.tokenizer, then updates the bad-words list and model dimension.
# Raises NotImplementedError when get_local_model_path() returns a falsy
# value. save_model and initial_load are not read anywhere in this method.
# Make model path the same as the model name to make this consistent
# with the other loading method if it isn't a known model type. This
# code is not just a workaround for below, it is also used to make the
# behavior consistent with other loading methods - Henk717
# if utils.koboldai_vars.model not in ["NeoCustom", "GPT2Custom"]:
# utils.koboldai_vars.custmodpth = utils.koboldai_vars.model
self.init_model_config()
# lazy_load is forced off here and nothing below turns it back on, so both
# `self.lazy_load` branches further down are never taken.
self.lazy_load = False
# gpu_blocks is parsed as a comma-separated list of ints. If it cannot be
# split or parsed, fall back to a single entry holding the layer count
# reported by utils.num_layers for the model config.
gpulayers = self.breakmodel_config.gpu_blocks
try:
self.gpu_layers_list = [int(l) for l in gpulayers.split(",")]
except (ValueError, AttributeError):
self.gpu_layers_list = [utils.num_layers(self.model_config)]
tf_kwargs = {
"low_cpu_mem_usage": True,
}
# If we're using torch_lazy_loader, we need to get breakmodel config
# early so that it knows where to load the individual model tensors
logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel))
if (
self.lazy_load
and utils.koboldai_vars.hascuda
and utils.koboldai_vars.breakmodel
and not utils.koboldai_vars.nobreakmodel
):
self.breakmodel_device_config(self.model_config)
if self.lazy_load:
# torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
tf_kwargs.pop("low_cpu_mem_usage", None)
# If we're using lazy loader, we need to figure out what the model's hidden layers are called
# NOTE(review): AutoModelForCausalLM is not among this module's visible
# imports; this line would raise NameError if the branch ever ran - confirm.
with lazy_loader.use_lazy_load(dematerialized_modules=True):
try:
metamodel = AutoModelForCausalLM.from_config(self.model_config)
utils.layers_module_names = utils.get_layers_module_names(metamodel)
utils.module_names = list(metamodel.state_dict().keys())
utils.named_buffers = list(metamodel.named_buffers(recurse=True))
except Exception as e:
if utils.args.panic:
raise e
logger.warning(f"Gave up on lazy loading due to {e}")
self.lazy_load = False
if self.get_local_model_path():
# Model is stored locally, load it.
self.model = self._get_model(self.get_local_model_path(), tf_kwargs)
self.tokenizer = self._get_tokenizer(self.get_local_model_path())
else:
raise NotImplementedError("GPTQ Model downloading not implemented")
# When the bad-words list is still the default object and the model type is
# not gpt2/gpt_neo/gptj, rebuild it from every vocabulary entry whose token
# text contains "[" or "]".
if (
utils.koboldai_vars.badwordsids is koboldai_settings.badwordsids_default
and utils.koboldai_vars.model_type not in ("gpt2", "gpt_neo", "gptj")
):
utils.koboldai_vars.badwordsids = [
[v]
for k, v in self.tokenizer.get_vocab().items()
if any(c in str(k) for c in "[]")
]
self.patch_embedding()
# Give the loaded model a back-reference to this backend instance.
self.model.kai_model = self
utils.koboldai_vars.modeldim = self.get_hidden_size()
def _get_model(self, location: str, tf_kwargs: dict):
    """Load the GPTQ-quantized model stored in ``location``.

    The GPTQ settings are read with ``load_model_gptq_settings``; when the
    config does not state a version it is detected from the weight file.
    Known model types (gptj, gpt_neox, llama, opt, mpt, gpt_bigcode) are
    loaded through ``gptq.offload.load_quant_offload`` using
    ``self.gpu_layers_list``; any other type falls back to ``auto_gptq``.

    Args:
        location: Local model directory.
        tf_kwargs: Accepted for signature compatibility; not used here.

    Returns:
        The loaded model object.

    Raises:
        RuntimeError: No GPTQ weight file was found in ``location``, or the
            model type is not natively supported and ``auto_gptq`` is not
            installed.
    """
    import gptq
    from gptq.gptj import load_quant as gptj_load_quant
    from gptq.gptneox import load_quant as gptneox_load_quant
    from gptq.llama import load_quant as llama_load_quant
    from gptq.opt import load_quant as opt_load_quant
    from gptq.bigcode import load_quant as bigcode_load_quant
    from gptq.mpt import load_quant as mpt_load_quant
    from gptq.offload import load_quant_offload

    _, gptq_bits, gptq_groupsize, gptq_file, gptq_version = load_model_gptq_settings(location)
    if not gptq_file:
        # gptq_file is False when detection failed; passing that on would
        # end up in open(False), i.e. file descriptor 0.
        raise RuntimeError(f"4-bit load failed. No GPTQ weights found in {location}")

    v2_bias = False
    if gptq_version < 0:
        gptq_version, v2_bias = get_gptq_version(gptq_file)
    gptq.modelutils.set_gptq_version(gptq_version)

    model_type = self.get_model_type()
    logger.info(f"Using GPTQ file: {gptq_file}, {gptq_bits}-bit model, type {model_type}, version {gptq_version}{' (with bias)' if v2_bias else ''}, groupsize {gptq_groupsize}")

    quant_loaders = {
        "gptj": gptj_load_quant,
        "gpt_neox": gptneox_load_quant,
        "llama": llama_load_quant,
        "opt": opt_load_quant,
        "mpt": mpt_load_quant,
        "gpt_bigcode": bigcode_load_quant,
    }

    if model_type in quant_loaders:
        model = load_quant_offload(
            quant_loaders[model_type],
            location,
            gptq_file,
            gptq_bits,
            gptq_groupsize,
            self.gpu_layers_list,
            force_bias=v2_bias,
        )
        if model_type == "gpt_bigcode":
            # Only the bigcode path converts the result to half precision.
            model = model.half()
        return model

    try:
        import auto_gptq
        from auto_gptq import AutoGPTQForCausalLM
    except ImportError as err:
        raise RuntimeError(f"4-bit load failed. Model type {model_type} not supported in 4-bit") from err

    try:
        import hf_bleeding_edge
    except ImportError:
        # Optional dependency. Without it auto_gptq keeps the classes it
        # imported itself; the old code dereferenced hf_bleeding_edge
        # anyway and died with a NameError.
        hf_bleeding_edge = None

    if hf_bleeding_edge is not None:
        # Monkey patch in hf_bleeding_edge to avoid having to trust remote code
        auto_gptq.modeling._utils.AutoConfig = hf_bleeding_edge.AutoConfig
        auto_gptq.modeling._base.AutoConfig = hf_bleeding_edge.AutoConfig
        auto_gptq.modeling._base.AutoModelForCausalLM = hf_bleeding_edge.AutoModelForCausalLM

    model = AutoGPTQForCausalLM.from_quantized(
        location,
        model_basename=Path(gptq_file).stem,
        use_safetensors=gptq_file.endswith(".safetensors"),
    )

    # Patch in embeddings function
    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    type(model).get_input_embeddings = get_input_embeddings

    # Patch in args support..
    def generate(self, *args, **kwargs):
        """shortcut for model.generate"""
        with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
            return self.model.generate(*args, **kwargs)

    type(model).generate = generate

    return model
def _get_tokenizer(self, location: str):
    """Build the tokenizer for the model directory ``location``.

    ``LlamaTokenizer`` is used when the model type is ``"llama"``,
    ``AutoTokenizer`` otherwise; the result is wrapped in
    ``GenericTokenizer``.
    """
    from transformers import AutoTokenizer, LlamaTokenizer

    is_llama = self.get_model_type() == "llama"
    tokenizer_cls = LlamaTokenizer if is_llama else AutoTokenizer
    return GenericTokenizer(tokenizer_cls.from_pretrained(location))

View File

@@ -1,6 +1,10 @@
import os, sys
from typing import Optional
from transformers import AutoConfig
try:
from hf_bleeding_edge import AutoConfig
except ImportError:
from transformers import AutoConfig
import warnings
import utils
import json
@@ -383,7 +387,17 @@ class HFInferenceModel(InferenceModel):
revision=utils.koboldai_vars.revision,
cache_dir="cache",
)
self.model_type = self.model_config.model_type
if "gptq_bits" in dir(self.model_config):
self.gptq_model = True
self.gptq_bits = self.model_config.gptq_bits
self.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1
self.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1
self.gptq_file = None
else:
self.gptq_model = False
except ValueError:
self.model_type = {
"NeoCustom": "gpt_neo",
@@ -394,4 +408,4 @@ class HFInferenceModel(InferenceModel):
logger.warning(
"No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)"
)
self.model_type = "gpt_neo"
self.model_type = "gpt_neo"

View File

@@ -17,9 +17,12 @@ from transformers import (
StoppingCriteria,
GPTNeoForCausalLM,
GPT2LMHeadModel,
AutoModelForCausalLM,
LogitsProcessorList,
)
try:
from hf_bleeding_edge import AutoModelForCausalLM
except ImportError:
from transformers import AutoModelForCausalLM
import utils
import modeling.lazy_loader as lazy_loader

View File

@@ -38,5 +38,8 @@ pytest-html==3.2.0
pytest-metadata==2.0.4
requests-mock==1.10.0
safetensors==0.3.1
git+https://github.com/0cc4m/hf_bleeding_edge/
--find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html gptq_koboldai==0.0.4
einops
peft==0.3.0
scipy

View File

@@ -393,4 +393,4 @@
</div>
<div id="notification-container"></div>
<div id="notification-container"></div>

View File

@@ -184,7 +184,7 @@ def decodenewlines(txt):
# Returns number of layers given an HF model config
#==================================================================#
def num_layers(config):
    """Return the number of layers described by an HF model config.

    Args:
        config: Either a dict (read via its ``"n_layer"`` key) or a config
            object. For objects the first attribute that exists among
            ``num_layers``, ``n_layer``, ``num_hidden_layers`` and
            ``n_layers`` is returned.

    Returns:
        The layer count, or None when the object has none of those
        attributes.
    """
    if isinstance(config, dict):
        return config["n_layer"]
    # Order matters: it is the lookup priority. The trailing "n_layers"
    # fallback used to sit in a second return statement that could never
    # be reached.
    for attr in ("num_layers", "n_layer", "num_hidden_layers", "n_layers"):
        if hasattr(config, attr):
            return getattr(config, attr)
    return None
#==================================================================#
# Downloads huggingface checkpoints using aria2c if possible
@@ -703,7 +703,7 @@ def applyoutputformatting(txt, no_sentence_trimming=False, no_single_line=False)
txt = replaceblanklines(txt)
# trim off starting new lines in replies if we're in chat mode
if koboldai_vars.chatmode and txt[0] == "\n":
if koboldai_vars.chatmode and txt and txt[0] == "\n":
txt = txt[1:]
# Remove special characters