Update to GPTQ module 0.0.2, add support for upstream CUDA quantizations and automatic version detection

Author: 0cc4m
Date: 2023-05-09 22:19:18 +02:00
parent 6121598142
commit a2d01bb9e4
6 changed files with 46 additions and 29 deletions

View File

@@ -1,3 +1,7 @@
<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-06-2/gptq_koboldai-0.0.1-cp38-cp38-linux_x86_64.whl">gptq_koboldai-0.0.1-cp38-cp38-linux_x86_64.whl</a>
<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-06-2/gptq_koboldai_rocm-0.0.1-cp38-cp38-linux_x86_64.whl">gptq_koboldai_rocm-0.0.1-cp38-cp38-linux_x86_64.whl</a>
<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-06-2/gptq_koboldai-0.0.1-cp38-cp38-win_amd64.whl">gptq_koboldai-0.0.1-cp38-cp38-win_amd64.whl</a>
<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-09/gptq_koboldai-0.0.2-cp38-cp38-linux_x86_64.whl">gptq_koboldai-0.0.2-cp38-cp38-linux_x86_64.whl</a>
<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-09/gptq_koboldai_rocm-0.0.2-cp38-cp38-linux_x86_64.whl">gptq_koboldai_rocm-0.0.2-cp38-cp38-linux_x86_64.whl</a>
<a href="https://github.com/0cc4m/GPTQ-for-LLaMa/releases/download/2023-05-09/gptq_koboldai-0.0.2-cp38-cp38-win_amd64.whl">gptq_koboldai-0.0.2-cp38-cp38-win_amd64.whl</a>

View File

@@ -49,5 +49,5 @@ dependencies:
    - diffusers
    - git+https://github.com/0cc4m/hf_bleeding_edge/
    - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html
    - gptq_koboldai==0.0.1
    - gptq_koboldai==0.0.2
    - einops
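The environments now pull the 0.0.2 wheel through the --find-links page updated above. A minimal sketch for checking which wheel actually got installed, assuming only the distribution names from these dependency lines (gptq_koboldai here, gptq_koboldai_rocm in the ROCm environment below):

# Minimal check of the installed GPTQ wheel version; standard library only.
from importlib.metadata import PackageNotFoundError, version

for dist in ("gptq_koboldai", "gptq_koboldai_rocm"):
    try:
        print(dist, version(dist))  # expect 0.0.2 after this update
    except PackageNotFoundError:
        pass  # each environment installs only one of the two wheels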

View File

@@ -24,8 +24,8 @@ dependencies:
  - Pillow
  - psutil
  - pip:
    - --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
    - torch==2.0.0+rocm5.4.2
    - --extra-index-url https://download.pytorch.org/whl/rocm5.2
    - torch==1.13.1+rocm5.2
    - flask-cloudflared==0.0.10
    - flask-ngrok
    - flask-cors
@@ -44,5 +44,5 @@ dependencies:
    - diffusers
    - git+https://github.com/0cc4m/hf_bleeding_edge/
    - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html
    - gptq_koboldai_rocm==0.0.1
    - gptq_koboldai_rocm==0.0.2
    - einops

View File

@@ -925,6 +925,7 @@ class story_settings(settings):
        self.gptq_model = False
        self.gptq_bits = -1
        self.gptq_groupsize = -1
        self.gptq_version = -1
        self.gptq_file = None
        self.save_paths = SavePaths(os.path.join("stories", self.story_name or "Untitled"))

View File

@@ -181,7 +181,8 @@ class HFInferenceModel(InferenceModel):
if "gptq_bits" in dir(self.model_config):
utils.koboldai_vars.gptq_model = True
utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits
utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize
utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1
utils.koboldai_vars.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1
utils.koboldai_vars.gptq_file = None
else:
utils.koboldai_vars.gptq_model = False
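For reference, a minimal standalone sketch of the fallback pattern in the hunk above, assuming a config object in the style of a Hugging Face PretrainedConfig; FakeGPTQConfig and its values are made up, and version 1 is the default assumed when an upstream quantized config carries no gptq_version:

# Hypothetical stand-in for self.model_config on an upstream-quantized model.
class FakeGPTQConfig:
    gptq_bits = 4
    gptq_groupsize = 128
    # gptq_version deliberately absent, as with older quantized checkpoints

cfg = FakeGPTQConfig()
groupsize = cfg.gptq_groupsize if getattr(cfg, "gptq_groupsize", False) else -1
version = cfg.gptq_version if getattr(cfg, "gptq_version", False) else 1
print(groupsize, version)  # -> 128 1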

View File

@@ -48,8 +48,7 @@ def prepare_4bit_load(modelpath):
        return path_4bit, False
    # Legacy format support
    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
    paths_4bit = ["4bit*.safetensors", "4bit*.pt", "4bit-old.safetensors", "4bit-old.pt"]
    result = False
    groupsize = -1
    for p in paths_4bit:
@@ -59,26 +58,11 @@ def prepare_4bit_load(modelpath):
            result = val[0]
            fname = Path(result).parts[-1]
            g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
            groupsize = -1
            if g:
                groupsize = int(g[0])
            break
    if not result:
        print("4-bit file not found, falling back to old format.")
        for p in paths_4bit_old:
            p = os.path.join(modelpath, p)
            if os.path.isfile(p):
                result = p
                break
        if not result:
            print("4-bit old-format file not found, loading failed.")
            raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.")
        gptq.modelutils.set_gptq_version(0)
    else:
        gptq.modelutils.set_gptq_version(1)
    return result, groupsize
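A worked example of the 4bit-<groupsize>g filename convention parsed above; the filenames are made up for illustration:

import re

# Hypothetical legacy checkpoint names run through the same pattern as above.
for fname in ["4bit.safetensors", "4bit-128g.safetensors", "4bit-32g-old.pt"]:
    g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
    print(fname, int(g[0]) if g else -1)
# 4bit.safetensors -1
# 4bit-128g.safetensors 128
# 4bit-32g-old.pt 32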
@@ -103,6 +87,7 @@ def load_model_gptq_settings():
        safetensors_file = os.path.join(utils.koboldai_vars.custmodpth, "model.safetensors")
        pt_file = os.path.join(utils.koboldai_vars.custmodpth, "model.ckpt")
        utils.koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file
        utils.koboldai_vars.gptq_version = js.get("gptq_version", -1)
    elif gptq_legacy_files:
        utils.koboldai_vars.gptq_model = True
        utils.koboldai_vars.gptq_bits = 4
@@ -110,10 +95,37 @@ def load_model_gptq_settings():
        fname = Path(utils.koboldai_vars.gptq_file).parts[-1]
        g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
        utils.koboldai_vars.gptq_groupsize = int(g[0]) if g else -1
        utils.koboldai_vars.gptq_version = -1
    else:
        utils.koboldai_vars.gptq_model = False


def get_gptq_version(fpath):
    v1_strings = ["zeros", "scales", "bias", "qweight"]
    v2_strings = ["qzeros", "scales", "bias", "qweight"]
    v3_strings = ["qzeros", "scales", "g_idx", "qweight"]
    with open(fpath, "rb") as f:
        data = str(f.read(1024*1024))
    v0 = all([s in data for s in v1_strings]) and not "qzeros" in data
    v1 = all([s in data for s in v2_strings])
    v2 = all([s in data for s in v3_strings])
    if v2:
        if v0 or v1:
            logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}")
        return 2
    if v1:
        if v0 or v2:
            logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}")
        return 1
    if v0:
        if v1 or v2:
            logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}")
        return 0
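A short usage sketch of the detection helper above, mirroring how a later hunk in this file feeds the result into gptq.modelutils.set_gptq_version; the checkpoint path is hypothetical:

import gptq.modelutils

# Hypothetical path to a quantized checkpoint; get_gptq_version is defined above.
ckpt_path = "/models/example-llama-4bit/4bit-128g.safetensors"

version = get_gptq_version(ckpt_path)
if version is None:
    # get_gptq_version falls through to None when no tensor-name pattern matches.
    raise RuntimeError(f"Could not identify GPTQ format of {ckpt_path}")
gptq.modelutils.set_gptq_version(version)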
class HFTorch4BitInferenceModel(HFTorchInferenceModel):
    def _load(self, save_model: bool, initial_load: bool) -> None:
        utils.koboldai_vars.allowsp = True
@@ -140,9 +152,6 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
        except ValueError:
            self.gpu_layers_list = [utils.num_layers(self.model_config)]
        if sum(self.gpu_layers_list) < utils.num_layers(self.model_config):
            print("4-bit CPU offloader active")
        tf_kwargs = {
            "low_cpu_mem_usage": True,
        }
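The "4-bit CPU offloader active" branch above fires when the configured per-GPU layer split does not cover the whole model; a hypothetical illustration of that check (layer counts are made up):

# Hypothetical numbers: gpu_layers_list holds how many layers go on each GPU;
# anything not covered by the split is left for CPU offload.
num_layers = 32            # made-up total layer count for the model
gpu_layers_list = [20, 8]  # 20 layers on GPU 0, 8 on GPU 1 -> 4 layers remain

if sum(gpu_layers_list) < num_layers:
    print("4-bit CPU offloader active")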
@@ -351,12 +360,14 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
        path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth)
        if utils.koboldai_vars.gptq_version < 0:
            utils.koboldai_vars.gptq_version = get_gptq_version(path_4bit)
        gptq.modelutils.set_gptq_version(utils.koboldai_vars.gptq_version)
        if legacy_groupsize is not False:
            groupsize = legacy_groupsize
        print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}")
        print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit")
        logger.info(f"Using 4-bit file: {path_4bit}, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}")
        if utils.koboldai_vars.model_type == "gptj":
            model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
        elif utils.koboldai_vars.model_type == "gpt_neox":