From 9c3d578d6c3449f951e97be06b67bc7b84eff0ba Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Tue, 2 May 2023 21:32:20 +0200
Subject: [PATCH] Work on model download support

---
 aiserver.py                                   | 32 +++----
 modeling/inference_models/generic_hf_torch.py |  5 +-
 modeling/inference_models/hf.py               |  8 ++
 modeling/inference_models/hf_torch_4bit.py    | 86 ++++++++++++++++---
 4 files changed, 98 insertions(+), 33 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index 48e70854..81bb900f 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -50,6 +50,8 @@ import multiprocessing
 import numpy as np
 from collections import OrderedDict
 from typing import Any, Callable, TypeVar, Tuple, Union, Dict, Set, List, Optional, Type
+import glob
+from pathlib import Path
 
 import requests
 import html
@@ -86,18 +88,6 @@ allowed_ips = set()  # empty set
 enable_whitelist = False
 
 
-# 4-bit dependencies
-from pathlib import Path
-import glob
-sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
-from gptj import load_quant as gptj_load_quant
-from gptneox import load_quant as gptneox_load_quant
-from llama import load_quant as llama_load_quant
-from opt import load_quant as opt_load_quant
-from offload import load_quant_offload
-monkey_patched_4bit = False
-
-
 if lupa.LUA_VERSION[:2] != (5, 4):
     logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.")
 
@@ -1974,6 +1964,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         except:
             pass
 
+    if not koboldai_vars.gptq_model:
+        # Run generic HF model load_config first to check what model it is
+        from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
+        model = GenericHFTorchInferenceModel(
+            koboldai_vars.model,
+            lazy_load=koboldai_vars.lazy_load,
+            low_mem=args.lowmem
+        )
+        model.load_config()
+
     if koboldai_vars.gptq_model:
         from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel
         model = HFTorch4BitInferenceModel(
@@ -1981,14 +1981,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
             lazy_load=koboldai_vars.lazy_load,
             low_mem=args.lowmem
         )
-    else:
-        from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
-        model = GenericHFTorchInferenceModel(
-            koboldai_vars.model,
-            lazy_load=koboldai_vars.lazy_load,
-            low_mem=args.lowmem
-        )
-
     model.load(
         save_model=not (args.colab or args.cacheonly) or args.savemodel,
         initial_load=initial_load,
diff --git a/modeling/inference_models/generic_hf_torch.py b/modeling/inference_models/generic_hf_torch.py
index ce91b176..d45513aa 100644
--- a/modeling/inference_models/generic_hf_torch.py
+++ b/modeling/inference_models/generic_hf_torch.py
@@ -24,7 +24,7 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel
 
 
 class GenericHFTorchInferenceModel(HFTorchInferenceModel):
-    def _load(self, save_model: bool, initial_load: bool) -> None:
+    def load_config(self) -> None:
         utils.koboldai_vars.allowsp = True
 
         # Make model path the same as the model name to make this consistent
@@ -50,6 +50,9 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
 
         self.init_model_config()
 
+    def _load(self, save_model: bool, initial_load: bool) -> None:
+        self.load_config()
+
         tf_kwargs = {
             "low_cpu_mem_usage": True,
         }
diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py
index eae4bb2d..480da5d3 100644
--- a/modeling/inference_models/hf.py
+++ b/modeling/inference_models/hf.py
@@ -61,6 +61,14 @@ class HFInferenceModel(InferenceModel):
                 cache_dir="cache",
             )
             utils.koboldai_vars.model_type = self.model_config.model_type
+
+            if "gptq_bits" in dir(self.model_config):
+                utils.koboldai_vars.gptq_model = True
+                utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits
+                utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize
+                utils.koboldai_vars.gptq_file = None
+            else:
+                utils.koboldai_vars.gptq_model = False
         except ValueError:
             utils.koboldai_vars.model_type = {
                 "NeoCustom": "gpt_neo",
diff --git a/modeling/inference_models/hf_torch_4bit.py b/modeling/inference_models/hf_torch_4bit.py
index a0e89436..f0ff87b9 100644
--- a/modeling/inference_models/hf_torch_4bit.py
+++ b/modeling/inference_models/hf_torch_4bit.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import os
+import glob
 import json
 import torch
 import re
@@ -9,7 +10,6 @@ import sys
 from typing import Union
 
 from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer
-from modeling.inference_model import SuperLegacyModelError
 
 import utils
 import modeling.lazy_loader as lazy_loader
@@ -33,6 +33,66 @@ from gptneox import load_quant as gptneox_load_quant
 from llama import load_quant as llama_load_quant
 from opt import load_quant as opt_load_quant
 from offload import load_quant_offload
+monkey_patched_4bit = False
+
+
+def prepare_4bit_load(modelpath):
+    path_4bit = os.path.join(modelpath, "model.safetensors")
+    if os.path.isfile(path_4bit):
+        return path_4bit, False
+
+    path_4bit = os.path.join(modelpath, "model.ckpt")
+    if os.path.isfile(path_4bit):
+        return path_4bit, False
+
+    # Legacy format support
+    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
+    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
+    result = False
+    groupsize = -1
+    for p in paths_4bit:
+        p = os.path.join(modelpath, p)
+        val = [v for v in glob.glob(p) if "4bit-old" not in v]
+        if val:
+            result = val[0]
+            fname = Path(result).parts[-1]
+            g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
+            if g:
+                groupsize = int(g[0])
+            break
+
+    global monkey_patched_4bit
+
+    # Monkey-patch in old-format pt-file support
+    if not result:
+        print("4-bit file not found, falling back to old format.")
+        for p in paths_4bit_old:
+            p = os.path.join(modelpath, p)
+            if os.path.isfile(p):
+                result = p
+                break
+
+        if not result:
+            print("4-bit old-format file not found, loading failed.")
+            raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.")
+
+        import llama, opt, gptneox, gptj, old_quant
+        llama.make_quant = old_quant.old_make_quant
+        opt.make_quant = old_quant.old_make_quant
+        gptneox.make_quant = old_quant.old_make_quant
+        gptj.make_quant = old_quant.old_make_quant
+        monkey_patched_4bit = True
+    elif monkey_patched_4bit:
+        # Undo monkey patch
+        print("Undoing 4-bit old format monkey patch")
+        import llama, opt, gptneox, gptj, quant
+        llama.make_quant = quant.make_quant
+        opt.make_quant = quant.make_quant
+        gptneox.make_quant = quant.make_quant
+        gptj.make_quant = quant.make_quant
+        monkey_patched_4bit = False
+
+    return result, groupsize
 
 
 class HFTorch4BitInferenceModel(HFTorchInferenceModel):
@@ -87,17 +147,12 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
         ):
             try:
                 metamodel = AutoModelForCausalLM.from_config(self.model_config)
+                utils.layers_module_names = utils.get_layers_module_names(metamodel)
+                utils.module_names = list(metamodel.state_dict().keys())
+                utils.named_buffers = list(metamodel.named_buffers(recurse=True))
             except Exception as e:
-                logger.error(f"Fell back to neo for metamodel due to {e}")
-                try:
-                    metamodel = GPTNeoForCausalLM.from_config(self.model_config)
-                except Exception as e:
-                    logger.error(f"Falling back again due to {e}")
-                    raise SuperLegacyModelError
-
-                utils.layers_module_names = utils.get_layers_module_names(metamodel)
-                utils.module_names = list(metamodel.state_dict().keys())
-                utils.named_buffers = list(metamodel.named_buffers(recurse=True))
+                logger.warning(f"Gave up on lazy loading due to {e}")
+                self.lazy_load = False
 
         # Download model from Huggingface if it does not exist, otherwise load locally
         with self._maybe_use_float16(), lazy_loader.use_lazy_load(
@@ -276,8 +331,15 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
         utils.koboldai_vars.modeldim = self.get_hidden_size()
 
     def _get_model(self, location: str, tf_kwargs: Dict):
-        path_4bit = utils.koboldai_vars.gptq_file
+        if not utils.koboldai_vars.custmodpth:
+            pass
         groupsize = utils.koboldai_vars.gptq_groupsize
+
+        path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth)
+
+        if legacy_groupsize is not False:
+            groupsize = legacy_groupsize
+
         print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}")
         print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit")
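
For context, the core of this patch is the GPTQ auto-detection added to modeling/inference_models/hf.py: when the Hugging Face config carries a gptq_bits attribute, the model is flagged as a 4-bit GPTQ model and aiserver.py's load_model() picks HFTorch4BitInferenceModel instead of GenericHFTorchInferenceModel. The snippet below is a hypothetical, simplified sketch of that dispatch and is not part of the patch; detect_gptq, the SimpleNamespace stand-ins for koboldai_vars and the HF config, and the getattr default are illustration-only assumptions.

# Hypothetical standalone sketch of the GPTQ-detection dispatch introduced above.
# KoboldAI's real classes and koboldai_vars are stubbed with SimpleNamespace;
# only the decision flow from hf.py / aiserver.py is illustrated.
from types import SimpleNamespace


def detect_gptq(model_config) -> SimpleNamespace:
    """Mirror the hf.py change: flag GPTQ models via attributes on the HF config."""
    flags = SimpleNamespace(gptq_model=False, gptq_bits=None,
                            gptq_groupsize=None, gptq_file=None)
    if "gptq_bits" in dir(model_config):
        flags.gptq_model = True
        flags.gptq_bits = model_config.gptq_bits
        # Assumption for this sketch only: fall back to -1 when no groupsize is given.
        flags.gptq_groupsize = getattr(model_config, "gptq_groupsize", -1)
    return flags


# Example: a config advertising 4-bit GPTQ quantization selects the 4-bit loader,
# matching the new if/else in aiserver.py's load_model().
config = SimpleNamespace(model_type="llama", gptq_bits=4, gptq_groupsize=128)
flags = detect_gptq(config)
loader = "HFTorch4BitInferenceModel" if flags.gptq_model else "GenericHFTorchInferenceModel"
print(loader, flags.gptq_bits, flags.gptq_groupsize)

In the patch itself the groupsize can also come from a legacy "4bit-<N>g" filename via prepare_4bit_load(), which overrides the config value when such a file is found.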