Work on model download support

0cc4m
2023-05-02 21:32:20 +02:00
parent f83a0aa122
commit 9c3d578d6c
4 changed files with 98 additions and 33 deletions

aiserver.py

@@ -50,6 +50,8 @@ import multiprocessing
 import numpy as np
 from collections import OrderedDict
 from typing import Any, Callable, TypeVar, Tuple, Union, Dict, Set, List, Optional, Type
+import glob
+from pathlib import Path
 import requests
 import html
@@ -86,18 +88,6 @@ allowed_ips = set() # empty set
 enable_whitelist = False
 
-# 4-bit dependencies
-from pathlib import Path
-import glob
-sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
-from gptj import load_quant as gptj_load_quant
-from gptneox import load_quant as gptneox_load_quant
-from llama import load_quant as llama_load_quant
-from opt import load_quant as opt_load_quant
-from offload import load_quant_offload
-monkey_patched_4bit = False
 
 if lupa.LUA_VERSION[:2] != (5, 4):
     logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.")
@@ -1974,6 +1964,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         except:
             pass
 
+        if not koboldai_vars.gptq_model:
+            # Run generic HF model load_config first to check what model it is
+            from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
+            model = GenericHFTorchInferenceModel(
+                koboldai_vars.model,
+                lazy_load=koboldai_vars.lazy_load,
+                low_mem=args.lowmem
+            )
+            model.load_config()
+
         if koboldai_vars.gptq_model:
             from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel
             model = HFTorch4BitInferenceModel(
@@ -1981,14 +1981,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 lazy_load=koboldai_vars.lazy_load,
                 low_mem=args.lowmem
             )
-        else:
-            from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
-            model = GenericHFTorchInferenceModel(
-                koboldai_vars.model,
-                lazy_load=koboldai_vars.lazy_load,
-                low_mem=args.lowmem
-            )
 
         model.load(
             save_model=not (args.colab or args.cacheonly) or args.savemodel,
             initial_load=initial_load,
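
Taken together, the aiserver.py hunks change load_model so that the generic Hugging Face backend is always constructed first and its new load_config() is used to probe the checkpoint; only if that probe sets koboldai_vars.gptq_model does the 4-bit backend take over. Below is a minimal standalone sketch of that two-step dispatch, using simplified stand-in classes and a fake vars object rather than the real KoboldAI ones:

    from types import SimpleNamespace

    # Simplified stand-ins for koboldai_vars and the two backends (not the real classes).
    koboldai_vars = SimpleNamespace(model="some-model", gptq_model=False, lazy_load=True)


    class GenericBackend:
        def __init__(self, model_name):
            self.model_name = model_name

        def load_config(self):
            # The real load_config() reads the HF config and sets
            # koboldai_vars.gptq_model / gptq_bits / gptq_groupsize.
            # Here a toy name check stands in for that config probe.
            koboldai_vars.gptq_model = "4bit" in self.model_name

        def load(self):
            print(f"loading {self.model_name} with {type(self).__name__}")


    class FourBitBackend(GenericBackend):
        pass


    def pick_backend():
        # Probe the config cheaply with the generic backend first...
        model = GenericBackend(koboldai_vars.model)
        model.load_config()
        # ...then swap in the 4-bit backend if the probe flagged GPTQ weights.
        if koboldai_vars.gptq_model:
            model = FourBitBackend(koboldai_vars.model)
        return model


    pick_backend().load()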

modeling/inference_models/generic_hf_torch.py

@@ -24,7 +24,7 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel
 class GenericHFTorchInferenceModel(HFTorchInferenceModel):
-    def _load(self, save_model: bool, initial_load: bool) -> None:
+    def load_config(self) -> None:
         utils.koboldai_vars.allowsp = True
 
         # Make model path the same as the model name to make this consistent
@@ -50,6 +50,9 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):
         self.init_model_config()
 
+    def _load(self, save_model: bool, initial_load: bool) -> None:
+        self.load_config()
+
         tf_kwargs = {
             "low_cpu_mem_usage": True,
         }
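
The point of this file's change is to split the old _load() into a cheap load_config() step (config and path resolution only) and the heavy weight-loading step, so aiserver.py can inspect a model before choosing a backend; _load() now simply calls load_config() itself first. A rough sketch of the pattern, with placeholder bodies:

    class Backend:
        def load_config(self):
            # Cheap: inspect the config only, no weights touched.
            self.model_type = "gptj"  # placeholder value

        def _load(self, save_model: bool, initial_load: bool) -> None:
            self.load_config()  # the config probe stays reusable on its own
            print("loading weights for", self.model_type, "save_model =", save_model)


    backend = Backend()
    backend.load_config()   # callers may probe without loading any weights
    backend._load(save_model=False, initial_load=True)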

modeling/inference_models/hf.py

@@ -61,6 +61,14 @@ class HFInferenceModel(InferenceModel):
                 cache_dir="cache",
             )
             utils.koboldai_vars.model_type = self.model_config.model_type
+
+            if "gptq_bits" in dir(self.model_config):
+                utils.koboldai_vars.gptq_model = True
+                utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits
+                utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize
+                utils.koboldai_vars.gptq_file = None
+            else:
+                utils.koboldai_vars.gptq_model = False
         except ValueError:
             utils.koboldai_vars.model_type = {
                 "NeoCustom": "gpt_neo",

modeling/inference_models/hf_torch_4bit.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import os
+import glob
 import json
 import torch
 import re
@@ -9,7 +10,6 @@ import sys
 from typing import Union
 
 from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer
-from modeling.inference_model import SuperLegacyModelError
 
 import utils
 import modeling.lazy_loader as lazy_loader
@@ -33,6 +33,66 @@ from gptneox import load_quant as gptneox_load_quant
 from llama import load_quant as llama_load_quant
 from opt import load_quant as opt_load_quant
 from offload import load_quant_offload
+monkey_patched_4bit = False
+
+
+def prepare_4bit_load(modelpath):
+    path_4bit = os.path.join(modelpath, "model.safetensors")
+    if os.path.isfile(path_4bit):
+        return path_4bit, False
+
+    path_4bit = os.path.join(modelpath, "model.ckpt")
+    if os.path.isfile(path_4bit):
+        return path_4bit, False
+
+    # Legacy format support
+    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
+    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
+    result = False
+    groupsize = -1
+    for p in paths_4bit:
+        p = os.path.join(modelpath, p)
+        val = [v for v in glob.glob(p) if "4bit-old" not in v]
+        if val:
+            result = val[0]
+            fname = Path(result).parts[-1]
+            g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
+            if g:
+                groupsize = int(g[0])
+            break
+
+    global monkey_patched_4bit
+
+    # Monkey-patch in old-format pt-file support
+    if not result:
+        print("4-bit file not found, falling back to old format.")
+        for p in paths_4bit_old:
+            p = os.path.join(modelpath, p)
+            if os.path.isfile(p):
+                result = p
+                break
+
+        if not result:
+            print("4-bit old-format file not found, loading failed.")
+            raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.")
+
+        import llama, opt, gptneox, gptj, old_quant
+        llama.make_quant = old_quant.old_make_quant
+        opt.make_quant = old_quant.old_make_quant
+        gptneox.make_quant = old_quant.old_make_quant
+        gptj.make_quant = old_quant.old_make_quant
+        monkey_patched_4bit = True
+    elif monkey_patched_4bit:
+        # Undo monkey patch
+        print("Undoing 4-bit old format monkey patch")
+        import llama, opt, gptneox, gptj, quant
+        llama.make_quant = quant.make_quant
+        opt.make_quant = quant.make_quant
+        gptneox.make_quant = quant.make_quant
+        gptj.make_quant = quant.make_quant
+        monkey_patched_4bit = False
+
+    return result, groupsize
 
 
 class HFTorch4BitInferenceModel(HFTorchInferenceModel):
@@ -87,17 +147,12 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
         ):
             try:
                 metamodel = AutoModelForCausalLM.from_config(self.model_config)
+                utils.layers_module_names = utils.get_layers_module_names(metamodel)
+                utils.module_names = list(metamodel.state_dict().keys())
+                utils.named_buffers = list(metamodel.named_buffers(recurse=True))
             except Exception as e:
-                logger.error(f"Fell back to neo for metamodel due to {e}")
-                try:
-                    metamodel = GPTNeoForCausalLM.from_config(self.model_config)
-                except Exception as e:
-                    logger.error(f"Falling back again due to {e}")
-                    raise SuperLegacyModelError
-            utils.layers_module_names = utils.get_layers_module_names(metamodel)
-            utils.module_names = list(metamodel.state_dict().keys())
-            utils.named_buffers = list(metamodel.named_buffers(recurse=True))
+                logger.warning(f"Gave up on lazy loading due to {e}")
+                self.lazy_load = False
 
         # Download model from Huggingface if it does not exist, otherwise load locally
         with self._maybe_use_float16(), lazy_loader.use_lazy_load(
@@ -276,8 +331,15 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
         utils.koboldai_vars.modeldim = self.get_hidden_size()
 
     def _get_model(self, location: str, tf_kwargs: Dict):
-        path_4bit = utils.koboldai_vars.gptq_file
+        if not utils.koboldai_vars.custmodpth:
+            pass
+
         groupsize = utils.koboldai_vars.gptq_groupsize
+        path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth)
+        if legacy_groupsize is not False:
+            groupsize = legacy_groupsize
+
         print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}")
         print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit")