KoboldAI-Client (mirror of https://github.com/KoboldAI/KoboldAI-Client.git)
Work on model download support
Changed: aiserver.py (32 lines)
aiserver.py
@@ -50,6 +50,8 @@ import multiprocessing
 import numpy as np
 from collections import OrderedDict
 from typing import Any, Callable, TypeVar, Tuple, Union, Dict, Set, List, Optional, Type
+import glob
+from pathlib import Path

 import requests
 import html
@@ -86,18 +88,6 @@ allowed_ips = set() # empty set
 enable_whitelist = False

-
-# 4-bit dependencies
-from pathlib import Path
-import glob
-sys.path.insert(0, os.path.abspath(Path("repos/gptq")))
-from gptj import load_quant as gptj_load_quant
-from gptneox import load_quant as gptneox_load_quant
-from llama import load_quant as llama_load_quant
-from opt import load_quant as opt_load_quant
-from offload import load_quant_offload
-monkey_patched_4bit = False
-

 if lupa.LUA_VERSION[:2] != (5, 4):
     logger.error(f"Please install lupa==1.10. You have lupa {lupa.__version__}.")

@@ -1974,6 +1964,16 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
         except:
             pass

+        if not koboldai_vars.gptq_model:
+            # Run generic HF model load_config first to check what model it is
+            from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
+            model = GenericHFTorchInferenceModel(
+                koboldai_vars.model,
+                lazy_load=koboldai_vars.lazy_load,
+                low_mem=args.lowmem
+            )
+            model.load_config()
+
         if koboldai_vars.gptq_model:
             from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel
             model = HFTorch4BitInferenceModel(
@@ -1981,14 +1981,6 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                 lazy_load=koboldai_vars.lazy_load,
                 low_mem=args.lowmem
             )
-        else:
-            from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
-            model = GenericHFTorchInferenceModel(
-                koboldai_vars.model,
-                lazy_load=koboldai_vars.lazy_load,
-                low_mem=args.lowmem
-            )
-
         model.load(
             save_model=not (args.colab or args.cacheonly) or args.savemodel,
             initial_load=initial_load,
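Taken together, the aiserver.py changes turn model selection into a two-step process: load the Hugging Face config first, then pick the backend based on what it reports. The following is a condensed, hypothetical sketch of that flow (paraphrased from the added lines above; koboldai_vars and args are aiserver.py module globals, and the wrapper function itself is not part of the commit):

# Hypothetical paraphrase of the new selection flow inside load_model()
# (illustrative only; not standalone code from the commit).
def pick_model_backend():
    # Step 1: probe the config with the generic backend so the shared HF config
    # code (see the HFInferenceModel hunk further down) can set
    # koboldai_vars.gptq_model / gptq_bits / gptq_groupsize.
    if not koboldai_vars.gptq_model:
        from modeling.inference_models.generic_hf_torch import GenericHFTorchInferenceModel
        model = GenericHFTorchInferenceModel(
            koboldai_vars.model,
            lazy_load=koboldai_vars.lazy_load,
            low_mem=args.lowmem,
        )
        model.load_config()

    # Step 2: if the config flags a GPTQ checkpoint, swap in the 4-bit backend;
    # otherwise the generic backend from step 1 is kept and loaded as before.
    if koboldai_vars.gptq_model:
        from modeling.inference_models.hf_torch_4bit import HFTorch4BitInferenceModel
        model = HFTorch4BitInferenceModel(
            koboldai_vars.model,
            lazy_load=koboldai_vars.lazy_load,
            low_mem=args.lowmem,
        )
    return model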
modeling/inference_models/generic_hf_torch.py
@@ -24,7 +24,7 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel


 class GenericHFTorchInferenceModel(HFTorchInferenceModel):
-    def _load(self, save_model: bool, initial_load: bool) -> None:
+    def load_config(self) -> None:
         utils.koboldai_vars.allowsp = True

         # Make model path the same as the model name to make this consistent
@@ -50,6 +50,9 @@ class GenericHFTorchInferenceModel(HFTorchInferenceModel):

         self.init_model_config()

+    def _load(self, save_model: bool, initial_load: bool) -> None:
+        self.load_config()
+
         tf_kwargs = {
             "low_cpu_mem_usage": True,
         }
modeling/inference_models/hf.py
@@ -61,6 +61,14 @@ class HFInferenceModel(InferenceModel):
                 cache_dir="cache",
             )
             utils.koboldai_vars.model_type = self.model_config.model_type
+
+            if "gptq_bits" in dir(self.model_config):
+                utils.koboldai_vars.gptq_model = True
+                utils.koboldai_vars.gptq_bits = self.model_config.gptq_bits
+                utils.koboldai_vars.gptq_groupsize = self.model_config.gptq_groupsize
+                utils.koboldai_vars.gptq_file = None
+            else:
+                utils.koboldai_vars.gptq_model = False
         except ValueError:
             utils.koboldai_vars.model_type = {
                 "NeoCustom": "gpt_neo",
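The detection added above keys off extra fields written into a quantized model's Hugging Face config. A minimal, hypothetical illustration of the same check outside the class (only the attribute names gptq_bits and gptq_groupsize come from the diff; the model path and concrete values are made up):

# Minimal illustration of the gptq_bits / gptq_groupsize detection above.
# "models/my-4bit-model" and the printed values are hypothetical.
from transformers import AutoConfig

model_config = AutoConfig.from_pretrained("models/my-4bit-model", cache_dir="cache")

if "gptq_bits" in dir(model_config):
    # e.g. a config.json carrying "gptq_bits": 4, "gptq_groupsize": 128
    print("GPTQ checkpoint:", model_config.gptq_bits, "bits, groupsize", model_config.gptq_groupsize)
else:
    print("Regular (non-quantized) checkpoint")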
modeling/inference_models/hf_torch_4bit.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import os
+import glob
 import json
 import torch
 import re
@@ -9,7 +10,6 @@ import sys
 from typing import Union

 from transformers import AutoModelForCausalLM, GPTNeoForCausalLM, AutoTokenizer, LlamaTokenizer
-from modeling.inference_model import SuperLegacyModelError

 import utils
 import modeling.lazy_loader as lazy_loader
@@ -33,6 +33,66 @@ from gptneox import load_quant as gptneox_load_quant
 from llama import load_quant as llama_load_quant
 from opt import load_quant as opt_load_quant
 from offload import load_quant_offload
+monkey_patched_4bit = False
+
+
+def prepare_4bit_load(modelpath):
+    path_4bit = os.path.join(modelpath, "model.safetensors")
+    if os.path.isfile(path_4bit):
+        return path_4bit, False
+
+    path_4bit = os.path.join(modelpath, "model.ckpt")
+    if os.path.isfile(path_4bit):
+        return path_4bit, False
+
+    # Legacy format support
+    paths_4bit = ["4bit*.safetensors", "4bit*.pt"]
+    paths_4bit_old = ["4bit-old.pt", "4bit-old.safetensors"]
+    result = False
+    groupsize = -1
+    for p in paths_4bit:
+        p = os.path.join(modelpath, p)
+        val = [v for v in glob.glob(p) if "4bit-old" not in v]
+        if val:
+            result = val[0]
+            fname = Path(result).parts[-1]
+            g = re.findall("^(?:4bit)(?:-)(\\d+)(?:g-?)", fname)
+            if g:
+                groupsize = int(g[0])
+            break
+
+    global monkey_patched_4bit
+
+    # Monkey-patch in old-format pt-file support
+    if not result:
+        print("4-bit file not found, falling back to old format.")
+        for p in paths_4bit_old:
+            p = os.path.join(modelpath, p)
+            if os.path.isfile(p):
+                result = p
+                break
+
+        if not result:
+            print("4-bit old-format file not found, loading failed.")
+            raise RuntimeError("4-bit load failed. PT/Safetensors-File not found.")
+
+        import llama, opt, gptneox, gptj, old_quant
+        llama.make_quant = old_quant.old_make_quant
+        opt.make_quant = old_quant.old_make_quant
+        gptneox.make_quant = old_quant.old_make_quant
+        gptj.make_quant = old_quant.old_make_quant
+        monkey_patched_4bit = True
+    elif monkey_patched_4bit:
+        # Undo monkey patch
+        print("Undoing 4-bit old format monkey patch")
+        import llama, opt, gptneox, gptj, quant
+        llama.make_quant = quant.make_quant
+        opt.make_quant = quant.make_quant
+        gptneox.make_quant = quant.make_quant
+        gptj.make_quant = quant.make_quant
+        monkey_patched_4bit = False
+
+    return result, groupsize


 class HFTorch4BitInferenceModel(HFTorchInferenceModel):
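The legacy branch of prepare_4bit_load encodes the groupsize in the checkpoint's file name. A quick, self-contained illustration of the pattern it parses (the regex is copied from the added code; the file names are made-up examples):

import re

# Same pattern as in prepare_4bit_load above; the file names are invented examples.
pattern = "^(?:4bit)(?:-)(\\d+)(?:g-?)"

print(re.findall(pattern, "4bit-128g.safetensors"))  # ['128'] -> groupsize 128
print(re.findall(pattern, "4bit.pt"))                # []      -> groupsize stays -1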
@@ -87,17 +147,12 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
         ):
             try:
                 metamodel = AutoModelForCausalLM.from_config(self.model_config)
+                utils.layers_module_names = utils.get_layers_module_names(metamodel)
+                utils.module_names = list(metamodel.state_dict().keys())
+                utils.named_buffers = list(metamodel.named_buffers(recurse=True))
             except Exception as e:
-                logger.error(f"Fell back to neo for metamodel due to {e}")
-                try:
-                    metamodel = GPTNeoForCausalLM.from_config(self.model_config)
-                except Exception as e:
-                    logger.error(f"Falling back again due to {e}")
-                    raise SuperLegacyModelError
-
-            utils.layers_module_names = utils.get_layers_module_names(metamodel)
-            utils.module_names = list(metamodel.state_dict().keys())
-            utils.named_buffers = list(metamodel.named_buffers(recurse=True))
+                logger.warning(f"Gave up on lazy loading due to {e}")
+                self.lazy_load = False

         # Download model from Huggingface if it does not exist, otherwise load locally
         with self._maybe_use_float16(), lazy_loader.use_lazy_load(
@@ -276,8 +331,15 @@ class HFTorch4BitInferenceModel(HFTorchInferenceModel):
         utils.koboldai_vars.modeldim = self.get_hidden_size()

     def _get_model(self, location: str, tf_kwargs: Dict):
-        path_4bit = utils.koboldai_vars.gptq_file
+        if not utils.koboldai_vars.custmodpth:
+            pass
         groupsize = utils.koboldai_vars.gptq_groupsize
+
+        path_4bit, legacy_groupsize = prepare_4bit_load(utils.koboldai_vars.custmodpth)
+
+        if legacy_groupsize is not False:
+            groupsize = legacy_groupsize
+
         print(f"Using 4-bit file: {path_4bit}, groupsize {groupsize}")

         print(f"Trying to load {utils.koboldai_vars.model_type} model in 4-bit")
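With the changes above, _get_model resolves the quantized weights from the model folder itself instead of a separately configured gptq_file. A sketch of a hypothetical local folder that this resolution would accept (the folder and file names are illustrative; only the patterns come from prepare_4bit_load):

# Hypothetical local 4-bit model folder (illustrative names only; the file
# patterns are the ones probed by prepare_4bit_load above):
#
#   models/my-llama-4bit/
#       config.json           # carries gptq_bits / gptq_groupsize (see the HFInferenceModel hunk)
#       tokenizer.model
#       model.safetensors     # preferred new-style checkpoint, picked up first
#       # legacy alternatives: 4bit-128g.safetensors, 4bit*.pt,
#       # or 4bit-old.pt / 4bit-old.safetensors (triggers the old-format monkey patch)
#
# For such a folder, the call added to _get_model above would evaluate to
# roughly: ("models/my-llama-4bit/model.safetensors", False)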