Mirror of https://github.com/KoboldAI/KoboldAI-Client.git
Lazy loader no longer requires map file except when loading to TPU
This commit is contained in:
parent b0a01962ab
commit 5253cdcb36
aiserver.py (30 changed lines)
@@ -1652,18 +1652,14 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
             device_map = {}
-            for _key, spec in lazy_load_spec.get("layer_weights", {}).items():
-                for layer in range(n_layers):
-                    key = _key.format(layer=layer)
-                    if key not in model_dict:
-                        continue
+            for key, value in model_dict.items():
+                if isinstance(value, torch_lazy_loader.LazyTensor) and not any(key.startswith(n) or key.startswith(n.split(".", 1)[1]) for n in vars.layer_param_names):
+                    device_map[key] = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu"
+                else:
+                    layer = int(next(n for n in vars.layer_param_names if key.startswith(n) or key.startswith(n.split(".", 1)[1])).rsplit(".", 1)[1])
                     device = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" if not vars.hascuda or not vars.breakmodel or layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
                     device_map[key] = device
-
-            for key, value in model_dict.items():
-                if isinstance(value, torch_lazy_loader.LazyTensor) and key not in device_map:
-                    device_map[key] = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu"

             if utils.num_shards is None or utils.current_shard == 0:
                 if utils.num_shards is not None:
                     num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs))
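The rewritten callback above derives each tensor's target device from vars.layer_param_names instead of reading a maps/<model_type>.json file. Below is a minimal, self-contained sketch of that placement logic; the layer prefixes, ram_blocks value, and GPU block boundaries are made-up example values, and the vars.usegpu / vars.hascuda branches of the real code are omitted for brevity.

# Hedged sketch of the per-tensor device decision (not the repo's exact code).
import bisect

layer_param_names = ["transformer.h.0", "transformer.h.1", "transformer.h.2", "transformer.h.3"]
ram_blocks = 1                    # example: first transformer block stays in CPU RAM
cumulative_gpu_blocks = (2, 3)    # example: 2 blocks on GPU 0, then 1 block on GPU 1

def pick_device(key):
    # Non-layer weights (embeddings, final layer norm, ...) go to a single device.
    prefix = next((n for n in layer_param_names if key.startswith(n)), None)
    if prefix is None:
        return "cpu"
    # Layer weights: the layer index is the last component of the matched prefix,
    # so no maps/<model_type>.json file is needed to recover it.
    layer = int(prefix.rsplit(".", 1)[1])
    if layer < ram_blocks:
        return "cpu"
    return bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)

for key in ("transformer.wte.weight", "transformer.h.0.attn.bias", "transformer.h.3.mlp.c_fc.weight"):
    print(key, "->", pick_device(key))
# transformer.wte.weight -> cpu
# transformer.h.0.attn.bias -> cpu   (layer 0 < ram_blocks)
# transformer.h.3.mlp.c_fc.weight -> 1   (second GPU block)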
@@ -1717,15 +1713,6 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
         lazy_load_callback.nested = False
         return lazy_load_callback

-    lazy_load_config_path = os.path.join("maps", vars.model_type + ".json")
-    if(vars.lazy_load and "model_config" in globals() and os.path.isfile(lazy_load_config_path)):
-        with open(lazy_load_config_path) as f:
-            lazy_load_spec = json.load(f)
-
-    else:
-        vars.lazy_load = False
-
-

     def get_hidden_size_from_model(model):
         try:
@@ -1800,6 +1787,13 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model="
                     import shutil
                     shutil.move(vars.model.replace('/', '_'), "models/{}".format(vars.model.replace('/', '_')))
             print("\n", flush=True)
+            if(vars.lazy_load): # If we're using lazy loader, we need to figure out what the model's hidden layers are called
+                with torch_lazy_loader.use_lazy_torch_load(dematerialized_modules=True):
+                    try:
+                        metamodel = AutoModelForCausalLM.from_config(model_config)
+                    except Exception as e:
+                        metamodel = GPTNeoForCausalLM.from_config(model_config)
+                    vars.layer_param_names = utils.get_layer_param_names(metamodel)
             with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(enable=vars.lazy_load, callback=get_lazy_load_callback(utils.num_layers(model_config)) if vars.lazy_load else None, dematerialized_modules=True):
                 if(vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
                     lowmem = {}
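The block added above builds a throwaway "metamodel" purely to learn what the model's layer modules are called, without allocating any weights. A hedged approximation of the same trick is sketched below, using accelerate's init_empty_weights in place of the repo's torch_lazy_loader.use_lazy_torch_load(dematerialized_modules=True); the model id is only an example.

# Hedged sketch: inspect parameter names of an architecture without materializing weights.
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("EleutherAI/gpt-neo-125M")  # example model id
with init_empty_weights():
    # Weights live on the "meta" device, so this is cheap even for large models.
    metamodel = AutoModelForCausalLM.from_config(config)

# With the parameter names in hand, per-layer device placement can be decided
# before any checkpoint shard is read from disk.
print(list(metamodel.state_dict().keys())[:3])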
utils.py (20 changed lines)
@@ -7,11 +7,19 @@ import tempfile
 import requests
 import requests.adapters
 import time
+from transformers import __version__ as transformers_version
+import packaging.version
 from tqdm.auto import tqdm
 import os
 import itertools
 from typing import Optional

+HAS_ACCELERATE = packaging.version.parse(transformers_version) >= packaging.version.parse("4.20.0.dev0")
+try:
+    import accelerate
+except ImportError:
+    HAS_ACCELERATE = False
+
 vars = None
 num_shards: Optional[int] = None
 current_shard = 0
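The HAS_ACCELERATE flag added above gates accelerate-specific code on both the installed transformers version and the presence of the accelerate package. A small illustration of how the packaging.version comparison behaves (the version strings below are examples):

# Pre-release-aware version comparison, as used for the HAS_ACCELERATE gate.
import packaging.version

for v in ("4.19.2", "4.20.0.dev0", "4.21.1"):
    print(v, packaging.version.parse(v) >= packaging.version.parse("4.20.0.dev0"))
# 4.19.2 False
# 4.20.0.dev0 True
# 4.21.1 True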
@@ -300,3 +308,15 @@ def get_sharded_checkpoint_num_tensors(pretrained_model_name_or_path, filename,
     import torch
     shard_paths, _ = transformers.modeling_utils.get_checkpoint_shard_files(pretrained_model_name_or_path, filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, use_auth_token=use_auth_token, user_agent=user_agent, revision=revision, mirror=mirror)
     return list(itertools.chain(*(torch.load(p, map_location="cpu").keys() for p in shard_paths)))
+
+def get_layer_param_names(model):
+    names = []
+    def recurse(module, head=""):
+        for c in module.named_children():
+            name = head + c[0]
+            if c[0].isnumeric() and any(c[1].__class__.__name__.endswith(suffix) for suffix in ("Block", "Layer")):
+                names.append(name)
+            else:
+                recurse(c[1], head=name + ".")
+    recurse(model)
+    return names
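For reference, a hedged usage sketch of the new get_layer_param_names helper on a tiny GPT-2-style model; the function body is repeated here so the snippet runs on its own, and the config values are arbitrary.

# Usage sketch: list the module-name prefixes of a model's transformer blocks.
from transformers import GPT2Config, GPT2LMHeadModel

def get_layer_param_names(model):
    # Same logic as the function added to utils.py above.
    names = []
    def recurse(module, head=""):
        for c in module.named_children():
            name = head + c[0]
            if c[0].isnumeric() and any(c[1].__class__.__name__.endswith(suffix) for suffix in ("Block", "Layer")):
                names.append(name)
            else:
                recurse(c[1], head=name + ".")
    recurse(model)
    return names

config = GPT2Config(n_layer=4, n_head=2, n_embd=64)  # tiny config so this runs quickly
model = GPT2LMHeadModel(config)
print(get_layer_param_names(model))
# Expected output, one prefix per transformer block:
# ['transformer.h.0', 'transformer.h.1', 'transformer.h.2', 'transformer.h.3']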