Mirror of https://github.com/KoboldAI/KoboldAI-Client.git
Remove unused file open
@@ -1171,125 +1171,118 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword
The change drops the zipfile handle that was opened but never used, together with the bookkeeping variables that existed only for it, and dedents the surviving body one level now that it no longer sits inside the `with` block. Removed:

-    with zipfile.ZipFile(f, "r") as z:

and, from the top of the old `try` block:

-            last_storage_key = None
-            zipfolder = os.path.basename(os.path.normpath(f)).split('.')[0]
-            f = None
-            current_offset = 0

and, from the end of the old `finally` block:

-            if isinstance(f, zipfile.ZipExtFile):
-                f.close()

The callback after the change:

    if callback.nested:
        return
    callback.nested = True
    try:
        if utils.current_shard == 0:
            print("\n\n\nThis model has ", f"{hk.data_structures.tree_size(network.state['params']):,d}".replace(",", " "), " parameters.\n")

        if utils.num_shards is None or utils.current_shard == 0:
            if utils.num_shards is not None:
                num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs))
            else:
                num_tensors = len(model_dict)

            if socketio is None:
                utils.bar = tqdm(total=num_tensors, desc="Loading model tensors")
            else:
                utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=utils.UIProgressBarFile(socketio.emit))
            koboldai_vars.status_message = "Loading model"
            koboldai_vars.loaded_layers = 0
            koboldai_vars.total_layers = num_tensors

        if utils.num_shards is not None:
            utils.current_shard += 1

        for key in sorted(model_dict.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)):
            model_spec_key = max((k for k in model_spec.keys() if key.endswith(k)), key=len, default=None)

            # Some model weights are used by transformers but not by MTJ.
            # We have to materialize these weights anyways because
            # transformers will throw a tantrum otherwise. To attain
            # the least possible memory usage, we create them as meta
            # tensors, which don't take up any actual CPU or TPU memory.
            if model_spec_key is None:
                model_dict[key] = torch.empty(model_dict[key].shape, dtype=model_dict[key].dtype, device="meta")
                utils.bar.update(1)
                koboldai_vars.loaded_layers += 1
                continue

            spec = model_spec[model_spec_key]
            transforms = set(spec.get("transforms", ()))

            if not isinstance(model_dict[key], lazy_loader.LazyTensor):
                error = f"Duplicate key {repr(key)}"
                print("\n\nERROR: " + error, file=sys.stderr)
                raise RuntimeError(error)

            tensor = model_dict[key].materialize(map_location="cpu")
            model_dict[key] = tensor.to("meta")

            # MTJ requires certain mathematical operations to be performed
            # on tensors in order for them to be in the correct format
            if "remove_first_two_rows" in transforms:
                tensor = tensor[2:]
            if "divide_by_shards" in transforms:
                tensor /= params["cores_per_replica"]
            if "vocab_pad" in transforms:
                tensor = torch.nn.functional.pad(tensor, (0,) * (tensor.ndim * 2 - 1) + (params["n_vocab_padding"],))
            # We don't need to transpose linear module weights anymore because MTJ will do it for us if `transposed_linear` is set to True in the config
            #if "no_transpose" not in transforms and tensor.ndim == 2:
            #    tensor = tensor.T
            tensor.unsqueeze_(0)

            # Shard the tensor so that parts of the tensor can be used
            # on different TPU cores
            tensor = reshard_reverse(
                tensor,
                params["cores_per_replica"],
                network.state["params"][spec["module"]][spec["param"]].shape,
            )
            tensor = tensor.detach()
            # numpy does not support bfloat16
            if tensor.dtype is torch.bfloat16:
                tensor = tensor.to(torch.float32)
            tensor = jnp.array(tensor)
            if tensor.dtype is torch.float16 or tensor.dtype is torch.float32:
                tensor = tensor.bfloat16()
            network.state["params"][spec["module"]][spec["param"]] = move_xmap(
                tensor,
                np.empty(params["cores_per_replica"]),
            )

            koboldai_vars.loaded_layers += 1
            try:
                time.sleep(0.01)
            except:
                pass
            utils.bar.update(1)

        if utils.num_shards is not None and utils.current_shard < utils.num_shards:
            return

        # Check for tensors that MTJ needs that were not provided in the
        # HF model
        for mk, mv in network.state["params"].items():
            for pk, pv in mv.items():
                if isinstance(pv, PlaceholderTensor):
                    # The transformers GPT-J models apparently do not
                    # have embedding bias, whereas MTJ GPT-J models do,
                    # so we have to supplement an embedding bias tensor
                    # by creating a tensor with the necessary shape, filled
                    # with zeros.
                    if mk == "causal_transformer_shard/~/embedding_shard/~/linear" and pk == "b":
                        mv[pk] = move_xmap(jnp.zeros(mv[pk].shape, dtype=jnp.bfloat16), np.empty(params["cores_per_replica"]))

                    else:
                        error = f"{mk} {pk} could not be found in the model checkpoint"
                        print("\n\nERROR: " + error, file=sys.stderr)
                        raise RuntimeError(error)
    finally:
        if utils.num_shards is None or utils.current_shard >= utils.num_shards:
            utils.bar.close()
            utils.bar = None
            koboldai_vars.status_message = ""
        callback.nested = False
callback.nested = False

if os.path.isdir(koboldai_vars.model.replace('/', '_')):
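The hunk above leans on a few PyTorch/JAX idioms that are easy to miss. The first is the `device="meta"` trick used for weights that transformers insists on seeing but MTJ never reads: a meta tensor carries only shape and dtype, so it costs no CPU or TPU memory. A minimal sketch (the shape below is an arbitrary illustration, not taken from the model):

    import torch

    # A meta tensor records only shape and dtype; no storage is allocated,
    # so a weight that merely has to exist can be represented for free.
    w = torch.empty((50400, 4096), dtype=torch.float16, device="meta")
    print(w.shape, w.dtype, w.device)   # torch.Size([50400, 4096]) torch.float16 meta
    print(w.is_meta)                    # True -- there is no data behind it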
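The `vocab_pad` transform relies on how `torch.nn.functional.pad` reads its pad tuple: from the last dimension backwards, two entries (before, after) per dimension. `(0,) * (tensor.ndim * 2 - 1) + (pad,)` therefore leaves every axis untouched except the first one, which gets `pad` zero rows appended, i.e. extra vocab rows on an embedding matrix. A small sketch with made-up sizes:

    import torch
    import torch.nn.functional as F

    emb = torch.randn(50257, 16)   # illustrative vocab x hidden shape
    pad = 143                      # illustrative padding amount
    padded = F.pad(emb, (0,) * (emb.ndim * 2 - 1) + (pad,))
    assert padded.shape == (50257 + pad, 16)
    assert torch.equal(padded[-pad:], torch.zeros(pad, 16))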
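The dtype handling before `move_xmap` exists because numpy, which bridges torch tensors into `jnp.array`, has no bfloat16: a bfloat16 checkpoint tensor is upcast to float32 on the CPU side before crossing into JAX, and the downcast back to bfloat16 can then happen on the JAX side. A hedged sketch of that round trip (the helper name is mine, not the repo's):

    import torch
    import jax.numpy as jnp

    def torch_to_bf16_jax(t: torch.Tensor) -> jnp.ndarray:
        # numpy cannot represent bfloat16, so detour through float32 and
        # restore bfloat16 once the data is a JAX array.
        t = t.detach()
        if t.dtype is torch.bfloat16:
            t = t.to(torch.float32)
        return jnp.array(t.numpy()).astype(jnp.bfloat16)

    x = torch.randn(4, 4, dtype=torch.bfloat16)
    print(torch_to_bf16_jax(x).dtype)   # bfloat16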