Mirror of https://github.com/KoboldAI/KoboldAI-Client.git, synced 2025-06-05 21:59:24 +02:00
Remove unused file open
@@ -1171,125 +1171,118 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword
if callback.nested:
return
callback.nested = True
with zipfile.ZipFile(f, "r") as z:
try:
last_storage_key = None
zipfolder = os.path.basename(os.path.normpath(f)).split('.')[0]
f = None
current_offset = 0
if utils.current_shard == 0:
print("\n\n\nThis model has ", f"{hk.data_structures.tree_size(network.state['params']):,d}".replace(",", " "), " parameters.\n")

if utils.num_shards is None or utils.current_shard == 0:
if utils.num_shards is not None:
num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs))
else:
num_tensors = len(model_dict)

if socketio is None:
utils.bar = tqdm(total=num_tensors, desc="Loading model tensors")
else:
utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=utils.UIProgressBarFile(socketio.emit))
koboldai_vars.status_message = "Loading model"
koboldai_vars.loaded_layers = 0
koboldai_vars.total_layers = num_tensors
try:
if utils.current_shard == 0:
print("\n\n\nThis model has ", f"{hk.data_structures.tree_size(network.state['params']):,d}".replace(",", " "), " parameters.\n")

if utils.num_shards is None or utils.current_shard == 0:
if utils.num_shards is not None:
utils.current_shard += 1
num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs))
else:
num_tensors = len(model_dict)

for key in sorted(model_dict.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)):
model_spec_key = max((k for k in model_spec.keys() if key.endswith(k)), key=len, default=None)
if socketio is None:
utils.bar = tqdm(total=num_tensors, desc="Loading model tensors")
else:
utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=utils.UIProgressBarFile(socketio.emit))
koboldai_vars.status_message = "Loading model"
koboldai_vars.loaded_layers = 0
koboldai_vars.total_layers = num_tensors

# Some model weights are used by transformers but not by MTJ.
# We have to materialize these weights anyways because
# transformers will throw a tantrum otherwise. To attain
# the least possible memory usage, we create them as meta
# tensors, which don't take up any actual CPU or TPU memory.
if model_spec_key is None:
model_dict[key] = torch.empty(model_dict[key].shape, dtype=model_dict[key].dtype, device="meta")
utils.bar.update(1)
koboldai_vars.loaded_layers += 1
continue
if utils.num_shards is not None:
utils.current_shard += 1

spec = model_spec[model_spec_key]
transforms = set(spec.get("transforms", ()))
for key in sorted(model_dict.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)):
model_spec_key = max((k for k in model_spec.keys() if key.endswith(k)), key=len, default=None)

if not isinstance(model_dict[key], lazy_loader.LazyTensor):
error = f"Duplicate key {repr(key)}"
print("\n\nERROR: " + error, file=sys.stderr)
raise RuntimeError(error)

tensor = model_dict[key].materialize(map_location="cpu")
model_dict[key] = tensor.to("meta")

# MTJ requires certain mathematical operations to be performed
# on tensors in order for them to be in the correct format
if "remove_first_two_rows" in transforms:
tensor = tensor[2:]
if "divide_by_shards" in transforms:
tensor /= params["cores_per_replica"]
if "vocab_pad" in transforms:
tensor = torch.nn.functional.pad(tensor, (0,) * (tensor.ndim * 2 - 1) + (params["n_vocab_padding"],))
# We don't need to transpose linear module weights anymore because MTJ will do it for us if `transposed_linear` is set to True in the config
#if "no_transpose" not in transforms and tensor.ndim == 2:
# tensor = tensor.T
tensor.unsqueeze_(0)


# Shard the tensor so that parts of the tensor can be used
# on different TPU cores
tensor = reshard_reverse(
tensor,
params["cores_per_replica"],
network.state["params"][spec["module"]][spec["param"]].shape,
)
tensor = tensor.detach()
# numpy does not support bfloat16
if tensor.dtype is torch.bfloat16:
tensor = tensor.to(torch.float32)
tensor = jnp.array(tensor)
if tensor.dtype is torch.float16 or tensor.dtype is torch.float32:
tensor = tensor.bfloat16()
network.state["params"][spec["module"]][spec["param"]] = move_xmap(
tensor,
np.empty(params["cores_per_replica"]),
)

koboldai_vars.loaded_layers += 1
try:
time.sleep(0.01)
except:
pass
# Some model weights are used by transformers but not by MTJ.
# We have to materialize these weights anyways because
# transformers will throw a tantrum otherwise. To attain
# the least possible memory usage, we create them as meta
# tensors, which don't take up any actual CPU or TPU memory.
if model_spec_key is None:
model_dict[key] = torch.empty(model_dict[key].shape, dtype=model_dict[key].dtype, device="meta")
utils.bar.update(1)
koboldai_vars.loaded_layers += 1
continue

if utils.num_shards is not None and utils.current_shard < utils.num_shards:
return
spec = model_spec[model_spec_key]
transforms = set(spec.get("transforms", ()))

# Check for tensors that MTJ needs that were not provided in the
# HF model
for mk, mv in network.state["params"].items():
for pk, pv in mv.items():
if isinstance(pv, PlaceholderTensor):
# The transformers GPT-J models apparently do not
# have embedding bias, whereas MTJ GPT-J models do,
# so we have to supplement an embedding bias tensor
# by creating a tensor with the necessary shape, filled
# with zeros.
if mk == "causal_transformer_shard/~/embedding_shard/~/linear" and pk == "b":
mv[pk] = move_xmap(jnp.zeros(mv[pk].shape, dtype=jnp.bfloat16), np.empty(params["cores_per_replica"]))
if not isinstance(model_dict[key], lazy_loader.LazyTensor):
error = f"Duplicate key {repr(key)}"
print("\n\nERROR: " + error, file=sys.stderr)
raise RuntimeError(error)

else:
error = f"{mk} {pk} could not be found in the model checkpoint"
print("\n\nERROR: " + error, file=sys.stderr)
raise RuntimeError(error)
finally:
if utils.num_shards is None or utils.current_shard >= utils.num_shards:
utils.bar.close()
utils.bar = None
koboldai_vars.status_message = ""
callback.nested = False
if isinstance(f, zipfile.ZipExtFile):
f.close()
tensor = model_dict[key].materialize(map_location="cpu")
model_dict[key] = tensor.to("meta")

# MTJ requires certain mathematical operations to be performed
# on tensors in order for them to be in the correct format
if "remove_first_two_rows" in transforms:
tensor = tensor[2:]
if "divide_by_shards" in transforms:
tensor /= params["cores_per_replica"]
if "vocab_pad" in transforms:
tensor = torch.nn.functional.pad(tensor, (0,) * (tensor.ndim * 2 - 1) + (params["n_vocab_padding"],))
# We don't need to transpose linear module weights anymore because MTJ will do it for us if `transposed_linear` is set to True in the config
#if "no_transpose" not in transforms and tensor.ndim == 2:
# tensor = tensor.T
tensor.unsqueeze_(0)


# Shard the tensor so that parts of the tensor can be used
# on different TPU cores
tensor = reshard_reverse(
tensor,
params["cores_per_replica"],
network.state["params"][spec["module"]][spec["param"]].shape,
)
tensor = tensor.detach()
# numpy does not support bfloat16
if tensor.dtype is torch.bfloat16:
tensor = tensor.to(torch.float32)
tensor = jnp.array(tensor)
if tensor.dtype is torch.float16 or tensor.dtype is torch.float32:
tensor = tensor.bfloat16()
network.state["params"][spec["module"]][spec["param"]] = move_xmap(
tensor,
np.empty(params["cores_per_replica"]),
)

koboldai_vars.loaded_layers += 1
try:
time.sleep(0.01)
except:
pass
utils.bar.update(1)

if utils.num_shards is not None and utils.current_shard < utils.num_shards:
return

# Check for tensors that MTJ needs that were not provided in the
# HF model
for mk, mv in network.state["params"].items():
for pk, pv in mv.items():
if isinstance(pv, PlaceholderTensor):
# The transformers GPT-J models apparently do not
# have embedding bias, whereas MTJ GPT-J models do,
# so we have to supplement an embedding bias tensor
# by creating a tensor with the necessary shape, filled
# with zeros.
if mk == "causal_transformer_shard/~/embedding_shard/~/linear" and pk == "b":
mv[pk] = move_xmap(jnp.zeros(mv[pk].shape, dtype=jnp.bfloat16), np.empty(params["cores_per_replica"]))

else:
error = f"{mk} {pk} could not be found in the model checkpoint"
print("\n\nERROR: " + error, file=sys.stderr)
raise RuntimeError(error)
finally:
if utils.num_shards is None or utils.current_shard >= utils.num_shards:
utils.bar.close()
utils.bar = None
koboldai_vars.status_message = ""
callback.nested = False
callback.nested = False

if os.path.isdir(koboldai_vars.model.replace('/', '_')):
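
The "meta tensors" mentioned in the comments above are a stock PyTorch feature: a tensor allocated on the "meta" device carries only shape and dtype metadata, so weights that transformers insists on materializing but MTJ never reads cost no real CPU or TPU memory. A minimal sketch of the idea (illustrative only, not part of this commit):

import torch

# Same pattern as model_dict[key] = torch.empty(..., device="meta") in the loader:
# the tensor has a shape and dtype but no backing storage.
placeholder = torch.empty((4096, 4096), dtype=torch.float16, device="meta")
print(placeholder.shape, placeholder.dtype, placeholder.device)
# Reading its values (e.g. placeholder.tolist()) would fail, which is fine here,
# because MTJ never uses these weights.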
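
The float32 detour around jnp.array exists because, as the "numpy does not support bfloat16" comment notes, numpy has no bfloat16 dtype. A small sketch of that conversion path (assumes torch and jax are installed; illustrative, not the loader's exact code):

import torch
import jax.numpy as jnp

t = torch.ones(2, 2, dtype=torch.bfloat16)
# Calling t.numpy() directly would fail: numpy cannot represent bfloat16.
t32 = t.to(torch.float32)            # widen to a dtype numpy understands
j = jnp.array(t32.numpy())           # hand the float32 copy to JAX
j = j.astype(jnp.bfloat16)           # narrow back to bfloat16 on the JAX side
print(j.dtype)                       # bfloat16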
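
The "vocab_pad" transform builds its padding tuple as (0,) * (tensor.ndim * 2 - 1) + (params["n_vocab_padding"],). Because torch.nn.functional.pad consumes that tuple from the last dimension backwards, the single non-zero entry pads only the end of dimension 0, i.e. it appends extra vocabulary rows. A quick check of that behaviour (illustrative values only):

import torch
import torch.nn.functional as F

w = torch.zeros(10, 4)                  # pretend vocab of 10 tokens, embedding dim 4
pad = (0,) * (w.ndim * 2 - 1) + (6,)    # (0, 0, 0, 6): pad dim 0 on the right by 6
print(F.pad(w, pad).shape)              # torch.Size([16, 4])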