From a2d01bb9e454a0c951fc9c4c3e67599bcf188b5b Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Tue, 9 May 2023 22:19:18 +0200
Subject: [PATCH] Update to GPTQ module 0.0.2, add support for upstream cuda quantizations, automatic detection

---
 docs/gptq-whl-links.html                   |  4 ++
 environments/huggingface.yml               |  2 +-
 environments/rocm.yml                      |  6 +--
 koboldai_settings.py                       |  1 +
 modeling/inference_models/hf.py            |  3 +-
 modeling/inference_models/hf_torch_4bit.py | 59 +++++++++++++---------
 6 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/docs/gptq-whl-links.html b/docs/gptq-whl-links.html
index 08cd0cd7..64d15d3d 100644
--- a/docs/gptq-whl-links.html
+++ b/docs/gptq-whl-links.html
@@ -1,3 +1,7 @@
 gptq_koboldai-0.0.1-cp38-cp38-linux_x86_64.whl
 gptq_koboldai_rocm-0.0.1-cp38-cp38-linux_x86_64.whl
 gptq_koboldai-0.0.1-cp38-cp38-win_amd64.whl
+
+gptq_koboldai-0.0.2-cp38-cp38-linux_x86_64.whl
+gptq_koboldai_rocm-0.0.2-cp38-cp38-linux_x86_64.whl
+gptq_koboldai-0.0.2-cp38-cp38-win_amd64.whl
diff --git a/environments/huggingface.yml b/environments/huggingface.yml
index 12978b39..c381ea94 100644
--- a/environments/huggingface.yml
+++ b/environments/huggingface.yml
@@ -49,5 +49,5 @@ dependencies:
     - diffusers
     - git+https://github.com/0cc4m/hf_bleeding_edge/
     - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html
-    - gptq_koboldai==0.0.1
+    - gptq_koboldai==0.0.2
     - einops
diff --git a/environments/rocm.yml b/environments/rocm.yml
index 0cb44eb1..4f6cfa11 100644
--- a/environments/rocm.yml
+++ b/environments/rocm.yml
@@ -24,8 +24,8 @@ dependencies:
   - Pillow
   - psutil
   - pip:
-    - --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
-    - torch==2.0.0+rocm5.4.2
+    - --extra-index-url https://download.pytorch.org/whl/rocm5.2
+    - torch==1.13.1+rocm5.2
     - flask-cloudflared==0.0.10
     - flask-ngrok
     - flask-cors
@@ -44,5 +44,5 @@ dependencies:
     - diffusers
     - git+https://github.com/0cc4m/hf_bleeding_edge/
     - --find-links=https://0cc4m.github.io/KoboldAI/gptq-whl-links.html
-    - gptq_koboldai_rocm==0.0.1
+    - gptq_koboldai_rocm==0.0.2
     - einops
diff --git a/koboldai_settings.py b/koboldai_settings.py
index 3e0fc48a..f0df2162 100644
--- a/koboldai_settings.py
+++ b/koboldai_settings.py
@@ -925,6 +925,7 @@ class story_settings(settings):
         self.gptq_model = False
         self.gptq_bits = -1
         self.gptq_groupsize = -1
+        self.gptq_version = -1
         self.gptq_file = None

         self.save_paths = SavePaths(os.path.join("stories", self.story_name or "Untitled"))
diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index 5ee2abaa..7050f34e 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -181,7 +181,8 @@ class HFInferenceModel(InferenceModel):
         if "gptq_bits" in dir(self.model_config):
             utils.koboldai_vars.gptq_model = True
             utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits
-            utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize
+            utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize if getattr(self.model_config, "gptq_groupsize", False) else -1
+            utils.koboldai_vars.gptq_version = self.model_config.gptq_version if getattr(self.model_config, "gptq_version", False) else 1
             utils.koboldai_vars.gptq_file = None
         else:
             utils.koboldai_vars.gptq_model = False
diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py
index 350cd761..5917a43e 100644
--- a/modeling/inference_models/hf_torch_4bit.py
+++ b/modeling/inference_models/hf_torch_4bit.py
@@ -48,8 +48,7 @@ def prepare_4bit_load(modelpath):
         return path_4bit, False

     # Legacy format support
-    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
-    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
+    paths_4bit = ["4bit*.safetensors", "4bit*.pt", "4bit-old.safetensors", "4bit-old.pt"]
     result = False
     groupsize = -1
     for p in paths_4bit:
@@ -59,26 +58,11 @@ def prepare_4bit_load(modelpath):
             result = val[0]
             fname = Path(result).parts[-1]
             g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
+            groupsize = -1
             if g:
                 groupsize = int(g[0])
             break

-    if not result:
-        print("4-bit file not found, falling back to old format.")
-        for p in paths_4bit_old:
-            p = os.path.join(modelpath, p)
-            if os.path.isfile(p):
-                result = p
-                break
-
-        if not result:
-            print("4-bit old-format file not found, loading failed.")
-            raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.")
-
-        gptq.modelutils.set_gptq_version(0)
-    else:
-        gptq.modelutils.set_gptq_version(1)
-
     return result, groupsize


@@ -103,6 +87,7 @@ def load_model_gptq_settings():
         safetensors_file = os.path.join(utils.koboldai_vars.custmodpth, "model.safetensors")
         pt_file = os.path.join(utils.koboldai_vars.custmodpth, "model.ckpt")
         utils.koboldai_vars.gptq_file = safetensors_file if os.path.isfile(safetensors_file) else pt_file
+        utils.koboldai_vars.gptq_version = js.get("gptq_version", -1)
     elif gptq_legacy_files:
         utils.koboldai_vars.gptq_model = True
         utils.koboldai_vars.gptq_bits = 4
@@ -110,10 +95,37 @@ def load_model_gptq_settings():
         fname = Path(utils.koboldai_vars.gptq_file).parts[-1]
         g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
         utils.koboldai_vars.gptq_groupsize = int(g[0]) if g else -1
+        utils.koboldai_vars.gptq_version = -1
     else:
         utils.koboldai_vars.gptq_model = False


+def get_gptq_version(fpath):
+    v1_strings = ["zeros", "scales", "bias", "qweight"]
+    v2_strings = ["qzeros", "scales", "bias", "qweight"]
+    v3_strings = ["qzeros", "scales", "g_idx", "qweight"]
+
+    with open(fpath, "rb") as f:
+        data = str(f.read(1024*1024))
+
+    v0 = all([s in data for s in v1_strings]) and not "qzeros" in data
+    v1 = all([s in data for s in v2_strings])
+    v2 = all([s in data for s in v3_strings])
+
+    if v2:
+        if v0 or v1:
+            logger.warning(f"GPTQ model identified as v2, but v0={v0} and v1={v1}")
+        return 2
+    if v1:
+        if v0 or v2:
+            logger.warning(f"GPTQ model identified as v1, but v0={v0} and v2={v2}")
+        return 1
+    if v0:
+        if v1 or v2:
+            logger.warning(f"GPTQ model identified as v0, but v1={v1} and v2={v2}")
+        return 0
+
+
 class HFTorch4BitInferenceModel(HFTorchInferenceModel):
     def _load(self, save_model: bool, initial_load: bool) -> None:
         utils.koboldai_vars.allowsp = True
@@ -140,9 +152,6 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
             except ValueError:
                 self.gpu_layers_list = [utils.num_layers(self.model_config)]

-        if sum(self.gpu_layers_list) < utils.num_layers(self.model_config):
-            print("4-bit CPU offloader active")
-
         tf_kwargs = {
             "low_cpu_mem_usage": True,
         }
@@ -351,12 +360,14 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
         path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth)

+        if utils.koboldai_vars.gptq_version < 0:
+            utils.koboldai_vars.gptq_version = get_gptq_version(path_4bit)
+        gptq.modelutils.set_gptq_version(utils.koboldai_vars.gptq_version)
+
         if legacy_groupsize is not False:
             groupsize = legacy_groupsize

-        print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}")
-
-        print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit")
+        logger.info(f"Using 4-bit file: {path_4bit}, type {utils.koboldai_vars.model_type}, version {utils.koboldai_vars.gptq_version}, groupsize {groupsize}")

         if utils.koboldai_vars.model_type == "gptj":
             model = load_quant_offload(gptj_load_quant, utils.koboldai_vars.custmodpth, path_4bit, 4, groupsize, self.gpu_layers_list)
         elif utils.koboldai_vars.model_type == "gpt_neox":
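
Note on the automatic detection: get_gptq_version() works because tensor-key names appear as plain strings near the start of a checkpoint, and the key sets differ between formats: legacy v0 files contain "zeros", v1 files contain "qzeros", and upstream cuda v2 files additionally carry the act-order index "g_idx". The following is a minimal standalone sketch of that same heuristic for trying it outside KoboldAI; the function name, the ValueError fallback and the command-line wrapper are illustrative additions, not part of the patch.

import sys

# Tensor-key names that distinguish the three GPTQ checkpoint formats,
# mirroring the string sets used by get_gptq_version() in the patch.
V0_KEYS = ["zeros", "scales", "bias", "qweight"]    # legacy format
V1_KEYS = ["qzeros", "scales", "bias", "qweight"]   # packed-zeros format
V2_KEYS = ["qzeros", "scales", "g_idx", "qweight"]  # upstream cuda format

def detect_gptq_version(fpath: str) -> int:
    # The first MiB is enough: safetensors keeps its JSON header at the start
    # of the file, and torch .pt archives typically store the pickled key
    # names near the beginning -- the same assumption the patch makes.
    with open(fpath, "rb") as f:
        data = str(f.read(1024 * 1024))
    if all(s in data for s in V2_KEYS):
        return 2
    if all(s in data for s in V1_KEYS):
        return 1
    # "zeros" is a substring of "qzeros", so v0 must also rule out "qzeros".
    if all(s in data for s in V0_KEYS) and "qzeros" not in data:
        return 0
    raise ValueError(f"Could not identify the GPTQ format of {fpath}")

if __name__ == "__main__":
    # Usage with a hypothetical path: python detect_gptq.py /path/to/4bit-128g.safetensors
    print(detect_gptq_version(sys.argv[1]))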
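
Note on the legacy naming convention: the groupsize of a legacy checkpoint is recovered purely from its file name. Both prepare_4bit_load() and load_model_gptq_settings() apply the regex ^(?:4bit)(?:-)(\d+)(?:g-?) and fall back to -1 when nothing matches. A small sketch of that parsing; the helper name and example file names are illustrative only.

import re

# Same pattern the patch uses to pull the groupsize out of legacy file names
# such as "4bit-128g.safetensors"; -1 means "no groupsize encoded".
GROUPSIZE_RE = re.compile(r"^(?:4bit)(?:-)(\d+)(?:g-?)")

def parse_legacy_groupsize(fname: str) -> int:
    m = GROUPSIZE_RE.match(fname)
    return int(m.group(1)) if m else -1

# Hypothetical examples of the naming convention:
assert parse_legacy_groupsize("4bit-128g.safetensors") == 128
assert parse_legacy_groupsize("4bit.safetensors") == -1   # no groupsize in the name
assert parse_legacy_groupsize("4bit-old.pt") == -1        # old-format file, no groupsize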