Lazy loader now attempts to pin layers if accelerate is enabled

Gnome Ann 2022-06-19 16:35:23 -04:00
parent 042cf3e560
commit 26c319519e
1 changed file with 13 additions and 4 deletions


@@ -1668,19 +1668,19 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model=""):
     else:
         ram_blocks = gpu_blocks = cumulative_gpu_blocks = None
 
-    def lazy_load_callback(model_dict, f, **_):
+    def lazy_load_callback(model_dict: Dict[str, torch.Tensor], f, **_):
         if lazy_load_callback.nested:
             return
         lazy_load_callback.nested = True
 
-        device_map = {}
+        device_map: Dict[str, Union[str, int]] = {}
 
         for key, value in model_dict.items():
             if isinstance(value, torch_lazy_loader.LazyTensor) and not any(key.startswith(n) or key.startswith(n.split(".", 1)[1]) for n in vars.layers_module_names):
                 device_map[key] = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu"
             else:
                 layer = int(max((n for n in vars.layers_module_names if key.startswith(n) or key.startswith(n.split(".", 1)[1])), key=len).rsplit(".", 1)[1])
-                device = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" if not vars.hascuda or not vars.breakmodel or layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
+                device = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" if not vars.hascuda or not vars.breakmodel else "shared" if layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
                 device_map[key] = device
 
         if utils.num_shards is None or utils.current_shard == 0:
@@ -1696,6 +1696,7 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model=""):
         last_storage_key = None
         f = None
         current_offset = 0
+        able_to_pin_layers = True
         if utils.num_shards is not None:
             utils.current_shard += 1
         for key in sorted(device_map.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)):
@@ -1721,6 +1722,14 @@ def load_model(use_gpu=True, gpu_layers=None, initial_load=False, online_model=""):
                 model_dict[key] = model_dict[key].to(torch.float16)
             if not vars.usegpu and not vars.breakmodel and model_dict[key].dtype is torch.float16:
                 model_dict[key] = model_dict[key].to(torch.float32)
-            model_dict[key] = model_dict[key].to(device)
+            if device == "shared":
+                model_dict[key] = model_dict[key].to("cpu").detach_()
+                if able_to_pin_layers and utils.HAS_ACCELERATE:
+                    try:
+                        model_dict[key] = model_dict[key].pin_memory()
+                    except:
+                        able_to_pin_layers = False
+            else:
+                model_dict[key] = model_dict[key].to(device)
             #print("OK", flush=True)
             current_offset += nbytes
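
The heart of the change is the new "shared" branch: layers that stay in system RAM for accelerate offloading are detached on the CPU and, when possible, moved into page-locked (pinned) memory, which lets later host-to-GPU copies run asynchronously. Below is a minimal sketch of the same pattern, not code from this commit; the stage_on_cpu helper, the tensor shape, and the non-blocking copy on the consumer side are illustrative assumptions.

import torch

def stage_on_cpu(t: torch.Tensor, able_to_pin: bool = True):
    # Keep a tensor in system RAM, pinning it when possible.
    # Pinned (page-locked) memory lets a later .to("cuda", non_blocking=True)
    # copy overlap with other work; pinning can fail (e.g. not enough
    # lockable RAM), in which case we fall back to ordinary pageable memory
    # and stop trying for subsequent tensors.
    t = t.to("cpu").detach_()
    if able_to_pin:
        try:
            t = t.pin_memory()
        except RuntimeError:
            able_to_pin = False
    return t, able_to_pin

# Hypothetical usage: staged weights can later be copied to the GPU
# without blocking the host thread.
weight, ok = stage_on_cpu(torch.empty(4096, 4096, dtype=torch.float16))
if torch.cuda.is_available():
    gpu_weight = weight.to("cuda", non_blocking=True)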
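The reworked device assignment in the first hunk places each tensor on the primary GPU, on the plain CPU, in the new "shared" bucket (layers below ram_blocks that stay in system RAM), or on a GPU index found with bisect.bisect_right over the running totals of per-GPU block counts. A small worked example with made-up block counts follows; it assumes cumulative_gpu_blocks is the running sum of gpu_blocks, which this diff does not show.

import bisect
from itertools import accumulate

gpu_blocks = [8, 12, 4]                                # hypothetical layer counts for GPUs 0, 1, 2
cumulative_gpu_blocks = tuple(accumulate(gpu_blocks))  # (8, 20, 24)

def gpu_for_layer(layer: int, ram_blocks: int = 4) -> int:
    # Layers below ram_blocks stay in shared CPU memory; the rest are
    # mapped to a GPU index by where they fall in the cumulative counts.
    assert layer >= ram_blocks
    return bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)

print(gpu_for_layer(4))   # 0 -> offset 0 falls in GPU 0's range
print(gpu_for_layer(12))  # 1 -> offset 8 is just past GPU 0's 8 blocks
print(gpu_for_layer(27))  # 2 -> offset 23 falls in GPU 2's range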