KoboldAI-Client (mirror of https://github.com/KoboldAI/KoboldAI-Client.git)
Work on model download support
Changed: aiserver.py (32 lines)
aiserver.py
@@ -50,6 +50,8 @@ import multiprocessing
 import numpy as np
 from collections import OrderedDict
 from typing import Any, Callable, TypeVar, Tuple, Union, Dict, Set, List, Optional, Type
+import glob
+from pathlib import Path

 import requests
 import html
@@ -86,18 +88,6 @@ allowed_ips = set() # empty set
 enable_whitelist = False

-
-# 4-bit dependencies
-from pathlib import Path
-import glob
-sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
-from gptj import load_quant as gptj_load_quant
-from gptneox import load_quant as gptneox_load_quant
-from llama import load_quant as llama_load_quant
-from opt import load_quant as opt_load_quant
-from offload import load_quant_offload
-monkey_patched_4bit = False
-

 if lupa.LUA_VERSION[:2] != (5, 4):
     logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.")

@@ -1974,6 +1964,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         except:
             pass

+        if not koboldai_vars.gptq_model:
+            # Run generic HF model load_config first to check what model it is
+            from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
+            model = GenericHFTorchInferenceModel(
+                koboldai_vars.model,
+                lazy_load=koboldai_vars.lazy_load,
+                low_mem=args.lowmem
+            )
+            model.load_config()
+
         if koboldai_vars.gptq_model:
             from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel
             model = HFTorch4BitInferenceModel(
@@ -1981,14 +1981,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 lazy_load=koboldai_vars.lazy_load,
                 low_mem=args.lowmem
             )
-        else:
-            from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
-            model = GenericHFTorchInferenceModel(
-                koboldai_vars.model,
-                lazy_load=koboldai_vars.lazy_load,
-                low_mem=args.lowmem
-            )
-
         model.load(
             save_model=not (args.colab or args.cacheonly) or args.savemodel,
             initial_load=initial_load,
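Taken together, the aiserver.py changes turn model selection into a two-step process: load the Hugging Face config first, then pick the backend based on what it reports. The following is a condensed, hypothetical sketch of that flow (paraphrased from the added lines above; koboldai_vars and args are aiserver.py module globals, and the wrapper function itself is not part of the commit):

# Hypothetical paraphrase of the new selection flow inside load_model()
# (illustrative only; not standalone code from the commit).
def pick_model_backend():
    # Step 1: probe the config with the generic backend so the shared HF config
    # code (see the HFInferenceModel hunk further down) can set
    # koboldai_vars.gptq_model / gptq_bits / gptq_groupsize.
    if not koboldai_vars.gptq_model:
        from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
        model = GenericHFTorchInferenceModel(
            koboldai_vars.model,
            lazy_load=koboldai_vars.lazy_load,
            low_mem=args.lowmem,
        )
        model.load_config()

    # Step 2: if the config flags a GPTQ checkpoint, swap in the 4-bit backend;
    # otherwise the generic backend from step 1 is kept and loaded as before.
    if koboldai_vars.gptq_model:
        from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel
        model = HFTorch4BitInferenceModel(
            koboldai_vars.model,
            lazy_load=koboldai_vars.lazy_load,
            low_mem=args.lowmem,
        )
    return model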
modeling/inference_models/generic_hf_torch.py
@@ -24,7 +24,7 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel


 class GenericHFTorchInferenceModel(HFTorchInferenceModel):
-    def _load(self, save_model: bool, initial_load: bool) -> None:
+    def load_config(self) -> None:
         utils.koboldai_vars.allowsp = True

         # Make model path the same as the model name to make this consistent
@@ -50,6 +50,9 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):

         self.init_model_config()

+    def _load(self, save_model: bool, initial_load: bool) -> None:
+        self.load_config()
+
         tf_kwargs = {
             "low_cpu_mem_usage": True,
         }
modeling/inference_models/hf.py
@@ -61,6 +61,14 @@ class HFInferenceModel(InferenceModel):
                 cache_dir="cache",
             )
             utils.koboldai_vars.model_type = self.model_config.model_type
+
+            if "gptq_bits" in dir(self.model_config):
+                utils.koboldai_vars.gptq_model = True
+                utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits
+                utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize
+                utils.koboldai_vars.gptq_file = None
+            else:
+                utils.koboldai_vars.gptq_model = False
         except ValueError:
             utils.koboldai_vars.model_type = {
                 "NeoCustom": "gpt_neo",
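The detection added above keys off extra fields written into a quantized model's Hugging Face config. A minimal, hypothetical illustration of the same check outside the class (only the attribute names gptq_bits and gptq_groupsize come from the diff; the model path and concrete values are made up):

# Minimal illustration of the gptq_bits / gptq_groupsize detection above.
# "models/my-4bit-model" and the printed values are hypothetical.
from transformers import AutoConfig

model_config = AutoConfig.from_pretrained("models/my-4bit-model", cache_dir="cache")

if "gptq_bits" in dir(model_config):
    # e.g. a config.json carrying "gptq_bits": 4, "gptq_groupsize": 128
    print("GPTQ checkpoint:", model_config.gptq_bits, "bits, groupsize", model_config.gptq_groupsize)
else:
    print("Regular (non-quantized) checkpoint")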
modeling/inference_models/hf_torch_4bit.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import os
+import glob
 import json
 import torch
 import re
@@ -9,7 +10,6 @@ import sys
 from typing import Union

 from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer
-from modeling.inference_model import SuperLegacyModelError

 import utils
 import modeling.lazy_loader as lazy_loader
@@ -33,6 +33,66 @@ from gptneox import load_quant as gptneox_load_quant
 from llama import load_quant as llama_load_quant
 from opt import load_quant as opt_load_quant
 from offload import load_quant_offload
+monkey_patched_4bit = False
+
+
+def prepare_4bit_load(modelpath):
+    path_4bit = os.path.join(modelpath, "model.safetensors")
+    if os.path.isfile(path_4bit):
+        return path_4bit, False
+
+    path_4bit = os.path.join(modelpath, "model.ckpt")
+    if os.path.isfile(path_4bit):
+        return path_4bit, False
+
+    # Legacy format support
+    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
+    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
+    result = False
+    groupsize = -1
+    for p in paths_4bit:
+        p = os.path.join(modelpath, p)
+        val = [v for v in glob.glob(p) if "4bit-old" not in v]
+        if val:
+            result = val[0]
+            fname = Path(result).parts[-1]
+            g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
+            if g:
+                groupsize = int(g[0])
+            break
+
+    global monkey_patched_4bit
+
+    # Monkey-patch in old-format pt-file support
+    if not result:
+        print("4-bit file not found, falling back to old format.")
+        for p in paths_4bit_old:
+            p = os.path.join(modelpath, p)
+            if os.path.isfile(p):
+                result = p
+                break
+
+        if not result:
+            print("4-bit old-format file not found, loading failed.")
+            raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.")
+
+        import llama, opt, gptneox, gptj, old_quant
+        llama.make_quant = old_quant.old_make_quant
+        opt.make_quant = old_quant.old_make_quant
+        gptneox.make_quant = old_quant.old_make_quant
+        gptj.make_quant = old_quant.old_make_quant
+        monkey_patched_4bit = True
+    elif monkey_patched_4bit:
+        # Undo monkey patch
+        print("Undoing 4-bit old format monkey patch")
+        import llama, opt, gptneox, gptj, quant
+        llama.make_quant = quant.make_quant
+        opt.make_quant = quant.make_quant
+        gptneox.make_quant = quant.make_quant
+        gptj.make_quant = quant.make_quant
+        monkey_patched_4bit = False
+
+    return result, groupsize


 class HFTorch4BitInferenceModel(HFTorchInferenceModel):
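The legacy branch of prepare_4bit_load encodes the groupsize in the checkpoint's file name. A quick, self-contained illustration of the pattern it parses (the regex is copied from the added code; the file names are made-up examples):

import re

# Same pattern as in prepare_4bit_load above; the file names are invented examples.
pattern = "^(?:4bit)(?:-)(\\d+)(?:g-?)"

print(re.findall(pattern, "4bit-128g.safetensors"))  # ['128'] -> groupsize 128
print(re.findall(pattern, "4bit.pt"))                # []      -> groupsize stays -1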
@@ -87,17 +147,12 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
         ):
             try:
                 metamodel = AutoModelForCausalLM.from_config(self.model_config)
+                utils.layers_module_names = utils.get_layers_module_names(metamodel)
+                utils.module_names = list(metamodel.state_dict().keys())
+                utils.named_buffers = list(metamodel.named_buffers(recurse=True))
             except Exception as e:
-                logger.error(f"Fell back to neo for metamodel due to {e}")
-                try:
-                    metamodel = GPTNeoForCausalLM.from_config(self.model_config)
-                except Exception as e:
-                    logger.error(f"Falling back again due to {e}")
-                    raise SuperLegacyModelError
-
-            utils.layers_module_names = utils.get_layers_module_names(metamodel)
-            utils.module_names = list(metamodel.state_dict().keys())
-            utils.named_buffers = list(metamodel.named_buffers(recurse=True))
+                logger.warning(f"Gave up on lazy loading due to {e}")
+                self.lazy_load = False

         # Download model from Huggingface if it does not exist, otherwise load locally
         with self._maybe_use_float16(), lazy_loader.use_lazy_load(
@@ -276,8 +331,15 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
         utils.koboldai_vars.modeldim = self.get_hidden_size()

     def _get_model(self, location: str, tf_kwargs: Dict):
-        path_4bit = utils.koboldai_vars.gptq_file
+        if not utils.koboldai_vars.custmodpth:
+            pass
         groupsize = utils.koboldai_vars.gptq_groupsize
+
+        path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth)
+
+        if legacy_groupsize is not False:
+            groupsize = legacy_groupsize
+
         print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}")

         print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit")
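With the changes above, _get_model resolves the quantized weights from the model folder itself instead of a separately configured gptq_file. A sketch of a hypothetical local folder that this resolution would accept (the folder and file names are illustrative; only the patterns come from prepare_4bit_load):

# Hypothetical local 4-bit model folder (illustrative names only; the file
# patterns are the ones probed by prepare_4bit_load above):
#
#   models/my-llama-4bit/
#       config.json           # carries gptq_bits / gptq_groupsize (see the HFInferenceModel hunk)
#       tokenizer.model
#       model.safetensors     # preferred new-style checkpoint, picked up first
#       # legacy alternatives: 4bit-128g.safetensors, 4bit*.pt,
#       # or 4bit-old.pt / 4bit-old.safetensors (triggers the old-format monkey patch)
#
# For such a folder, the call added to _get_model above would evaluate to
# roughly: ("models/my-llama-4bit/model.safetensors", False)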