Work on model download support

0cc4m
2023-05-02 21:32:20 +02:00
parent f83a0aa122
commit 9c3d578d6c
4 changed files with 98 additions and 33 deletions

View File

@@ -50,6 +50,8 @@ import multiprocessing
import numpy as np
from collections import OrderedDict
from typing import Any, Callable, TypeVar, Tuple, Union, Dict, Set, List, Optional, Type
import glob
from pathlib import Path
import requests
import html
@@ -86,18 +88,6 @@ allowed_ips = set() # empty set
enable_whitelist = False
# 4-bit dependencies
from pathlib import Path
import glob
sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
from gptj import load_quant as gptj_load_quant
from gptneox import load_quant as gptneox_load_quant
from llama import load_quant as llama_load_quant
from opt import load_quant as opt_load_quant
from offload import load_quant_offload
monkey_patched_4bit = False
if lupa.LUA_VERSION[:2] != (5, 4):
logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.")
@@ -1974,6 +1964,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
except:
pass
if not koboldai_vars.gptq_model:
# Run generic HF model load_config first to check what model it is
from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
model = GenericHFTorchInferenceModel(
koboldai_vars.model,
lazy_load=koboldai_vars.lazy_load,
low_mem=args.lowmem
)
model.load_config()
if koboldai_vars.gptq_model:
from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel
model = HFTorch4BitInferenceModel(
@@ -1981,14 +1981,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
lazy_load=koboldai_vars.lazy_load,
low_mem=args.lowmem
)
else:
from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
model = GenericHFTorchInferenceModel(
koboldai_vars.model,
lazy_load=koboldai_vars.lazy_load,
low_mem=args.lowmem
)
model.load(
save_model=not (args.colab or args.cacheonly) or args.savemodel,
initial_load=initial_load,
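
This hunk makes load_model() probe the checkpoint first: GenericHFTorchInferenceModel.load_config() runs, fills koboldai_vars.gptq_model from the config, and only then is HFTorch4BitInferenceModel substituted for GPTQ checkpoints. A minimal sketch of that config-first dispatch outside KoboldAI (AutoConfig is the real transformers API; pick_backend and its return values are illustrative only):

# Sketch of the config-first backend dispatch; pick_backend is a hypothetical
# helper, not part of KoboldAI. Only the config is fetched, not the weights.
from transformers import AutoConfig

def pick_backend(model_name: str, cache_dir: str = "cache") -> str:
    config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
    # GPTQ checkpoints in this scheme carry gptq_* keys in their config.json.
    if hasattr(config, "gptq_bits"):
        return "hf_torch_4bit"       # handled by HFTorch4BitInferenceModel
    return "generic_hf_torch"        # handled by GenericHFTorchInferenceModel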

View File

@@ -24,7 +24,7 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel
class GenericHFTorchInferenceModel(HFTorchInferenceModel):
def _load(self, save_model: bool, initial_load: bool) -> None:
def load_config(self) -> None:
utils.koboldai_vars.allowsp = True
# Make model path the same as the model name to make this consistent
@@ -50,6 +50,9 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
self.init_model_config()
def _load(self, save_model: bool, initial_load: bool) -> None:
self.load_config()
tf_kwargs = {
"low_cpu_mem_usage": True,
}
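
The change above splits the config step out of _load() into a standalone load_config() method, so callers such as load_model() can inspect the model type before any weights are touched, while _load() keeps its previous behaviour by calling load_config() first. A simplified sketch of that shape (class and helper names below are placeholders, not the project's real signatures):

# Simplified illustration of the load_config()/_load() split; placeholder names.
class HFModelSketch:
    def load_config(self) -> None:
        # Formerly the opening section of _load(); now reusable on its own so the
        # caller can look at self.model_config before committing to a backend.
        self.model_config = self._fetch_config()    # hypothetical helper

    def _load(self, save_model: bool, initial_load: bool) -> None:
        self.load_config()                          # same call order as before
        tf_kwargs = {"low_cpu_mem_usage": True}
        self._load_weights(**tf_kwargs)             # hypothetical helper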

View File

@@ -61,6 +61,14 @@ class HFInferenceModel(InferenceModel):
cache_dir="cache",
)
utils.koboldai_vars.model_type = self.model_config.model_type
if "gptq_bits" in dir(self.model_config):
utils.koboldai_vars.gptq_model = True
utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits
utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize
utils.koboldai_vars.gptq_file = None
else:
utils.koboldai_vars.gptq_model = False
except ValueError:
utils.koboldai_vars.model_type = {
"NeoCustom": "gpt_neo",

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import os
import glob
import json
import torch
import re
@@ -9,7 +10,6 @@ import sys
from typing import Union
from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer
from modeling.inference_model import SuperLegacyModelError
import utils
import modeling.lazy_loader as lazy_loader
@@ -33,6 +33,66 @@ from gptneox import load_quant as gptneox_load_quant
from llama import load_quant as llama_load_quant
from opt import load_quant as opt_load_quant
from offload import load_quant_offload
monkey_patched_4bit = False
def prepare_4bit_load(modelpath):
    path_4bit = os.path.join(modelpath, "model.safetensors")
    if os.path.isfile(path_4bit):
        return path_4bit, False

    path_4bit = os.path.join(modelpath, "model.ckpt")
    if os.path.isfile(path_4bit):
        return path_4bit, False

    # Legacy format support
    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
    result = False
    groupsize = -1
    for p in paths_4bit:
        p = os.path.join(modelpath, p)
        val = [v for v in glob.glob(p) if "4bit-old" not in v]
        if val:
            result = val[0]
            fname = Path(result).parts[-1]
            g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
            if g:
                groupsize = int(g[0])
            break

    global monkey_patched_4bit

    # Monkey-patch in old-format pt-file support
    if not result:
        print("4-bit file not found, falling back to old format.")
        for p in paths_4bit_old:
            p = os.path.join(modelpath, p)
            if os.path.isfile(p):
                result = p
                break

        if not result:
            print("4-bit old-format file not found, loading failed.")
            raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.")

        import llama, opt, gptneox, gptj, old_quant
        llama.make_quant = old_quant.old_make_quant
        opt.make_quant = old_quant.old_make_quant
        gptneox.make_quant = old_quant.old_make_quant
        gptj.make_quant = old_quant.old_make_quant
        monkey_patched_4bit = True
    elif monkey_patched_4bit:
        # Undo monkey patch
        print("Undoing 4-bit old format monkey patch")
        import llama, opt, gptneox, gptj, quant
        llama.make_quant = quant.make_quant
        opt.make_quant = quant.make_quant
        gptneox.make_quant = quant.make_quant
        gptj.make_quant = quant.make_quant
        monkey_patched_4bit = False

    return result, groupsize
class HFTorch4BitInferenceModel(HFTorchInferenceModel):
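
prepare_4bit_load() resolves the quantized weight file in three steps: a standard model.safetensors or model.ckpt in the model directory, then legacy 4bit*.safetensors / 4bit*.pt names with the groupsize encoded in the filename, and finally 4bit-old.* files that require the old_quant monkey patch. A small worked example of just the filename parsing (the sample filenames are invented; the pattern mirrors the one used above):

# Worked example of the legacy filename parsing; sample names are invented and
# the regex mirrors prepare_4bit_load() above.
import re

def groupsize_from_name(fname: str) -> int:
    g = re.findall(r"^(?:4bit)(?:-)(\d+)(?:g-?)", fname)
    return int(g[0]) if g else -1

print(groupsize_from_name("4bit-128g.safetensors"))   # 128
print(groupsize_from_name("4bit.pt"))                 # -1, no groupsize encoded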
@@ -87,17 +147,12 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
):
try:
metamodel = AutoModelForCausalLM.from_config(self.model_config)
utils.layers_module_names = utils.get_layers_module_names(metamodel)
utils.module_names = list(metamodel.state_dict().keys())
utils.named_buffers = list(metamodel.named_buffers(recurse=True))
except Exception as e:
logger.error(f"Fell back to neo for metamodel due to {e}")
try:
metamodel = GPTNeoForCausalLM.from_config(self.model_config)
except Exception as e:
logger.error(f"Falling back again due to {e}")
raise SuperLegacyModelError
utils.layers_module_names = utils.get_layers_module_names(metamodel)
utils.module_names = list(metamodel.state_dict().keys())
utils.named_buffers = list(metamodel.named_buffers(recurse=True))
logger.warning(f"Gave up on lazy loading due to {e}")
self.lazy_load = False
# Download model from Huggingface if it does not exist, otherwise load locally
with self._maybe_use_float16(), lazy_loader.use_lazy_load(
@@ -276,8 +331,15 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
utils.koboldai_vars.modeldim = self.get_hidden_size()
def _get_model(self, location: str, tf_kwargs: Dict):
path_4bit = utils.koboldai_vars.gptq_file
if not utils.koboldai_vars.custmodpth:
pass
groupsize = utils.koboldai_vars.gptq_groupsize
path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth)
if legacy_groupsize is not False:
groupsize = legacy_groupsize
print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}")
print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit")