From 32917fd651cdd86e791ba80f6e3deacb405a00e7 Mon Sep 17 00:00:00 2001
From: somebody
Date: Mon, 3 Jul 2023 17:51:54 -0500
Subject: [PATCH] Remove unused file open

---
 tpu_mtj_backend.py | 211 ++++++++++++++++++++++-----------------------
 1 file changed, 102 insertions(+), 109 deletions(-)

diff --git a/tpu_mtj_backend.py b/tpu_mtj_backend.py
index a5fd9d69..5a5271e2 100644
--- a/tpu_mtj_backend.py
+++ b/tpu_mtj_backend.py
@@ -1171,125 +1171,118 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword
         if callback.nested:
             return
         callback.nested = True
-        with zipfile.ZipFile(f, "r") as z:
-            try:
-                last_storage_key = None
-                zipfolder = os.path.basename(os.path.normpath(f)).split('.')[0]
-                f = None
-                current_offset = 0
-                if utils.current_shard == 0:
-                    print("\n\n\nThis model has ", f"{hk.data_structures.tree_size(network.state['params']):,d}".replace(",", " "), " parameters.\n")
-
-                if utils.num_shards is None or utils.current_shard == 0:
-                    if utils.num_shards is not None:
-                        num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs))
-                    else:
-                        num_tensors = len(model_dict)
-
-                    if socketio is None:
-                        utils.bar = tqdm(total=num_tensors, desc="Loading model tensors")
-                    else:
-                        utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=utils.UIProgressBarFile(socketio.emit))
-                    koboldai_vars.status_message = "Loading model"
-                    koboldai_vars.loaded_layers = 0
-                    koboldai_vars.total_layers = num_tensors
+        try:
+            if utils.current_shard == 0:
+                print("\n\n\nThis model has ", f"{hk.data_structures.tree_size(network.state['params']):,d}".replace(",", " "), " parameters.\n")
 
+            if utils.num_shards is None or utils.current_shard == 0:
                 if utils.num_shards is not None:
-                    utils.current_shard += 1
+                    num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs))
+                else:
+                    num_tensors = len(model_dict)
 
-                for key in sorted(model_dict.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)):
-                    model_spec_key = max((k for k in model_spec.keys() if key.endswith(k)), key=len, default=None)
+                if socketio is None:
+                    utils.bar = tqdm(total=num_tensors, desc="Loading model tensors")
+                else:
+                    utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=utils.UIProgressBarFile(socketio.emit))
+                koboldai_vars.status_message = "Loading model"
+                koboldai_vars.loaded_layers = 0
+                koboldai_vars.total_layers = num_tensors
 
-                    # Some model weights are used by transformers but not by MTJ.
-                    # We have to materialize these weights anyways because
-                    # transformers will throw a tantrum otherwise. To attain
-                    # the least possible memory usage, we create them as meta
-                    # tensors, which don't take up any actual CPU or TPU memory.
-                    if model_spec_key is None:
-                        model_dict[key] = torch.empty(model_dict[key].shape, dtype=model_dict[key].dtype, device="meta")
-                        utils.bar.update(1)
-                        koboldai_vars.loaded_layers += 1
-                        continue
+            if utils.num_shards is not None:
+                utils.current_shard += 1
 
-                    spec = model_spec[model_spec_key]
-                    transforms = set(spec.get("transforms", ()))
+            for key in sorted(model_dict.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)):
+                model_spec_key = max((k for k in model_spec.keys() if key.endswith(k)), key=len, default=None)
 
-                    if not isinstance(model_dict[key], lazy_loader.LazyTensor):
-                        error = f"Duplicate key {repr(key)}"
-                        print("\n\nERROR: " + error, file=sys.stderr)
-                        raise RuntimeError(error)
-
-                    tensor = model_dict[key].materialize(map_location="cpu")
-                    model_dict[key] = tensor.to("meta")
-
-                    # MTJ requires certain mathematical operations to be performed
-                    # on tensors in order for them to be in the correct format
-                    if "remove_first_two_rows" in transforms:
-                        tensor = tensor[2:]
-                    if "divide_by_shards" in transforms:
-                        tensor /= params["cores_per_replica"]
-                    if "vocab_pad" in transforms:
-                        tensor = torch.nn.functional.pad(tensor, (0,) * (tensor.ndim * 2 - 1) + (params["n_vocab_padding"],))
-                    # We don't need to transpose linear module weights anymore because MTJ will do it for us if `transposed_linear` is set to True in the config
-                    #if "no_transpose" not in transforms and tensor.ndim == 2:
-                    #    tensor = tensor.T
-                    tensor.unsqueeze_(0)
-
-
-                    # Shard the tensor so that parts of the tensor can be used
-                    # on different TPU cores
-                    tensor = reshard_reverse(
-                        tensor,
-                        params["cores_per_replica"],
-                        network.state["params"][spec["module"]][spec["param"]].shape,
-                    )
-                    tensor = tensor.detach()
-                    # numpy does not support bfloat16
-                    if tensor.dtype is torch.bfloat16:
-                        tensor = tensor.to(torch.float32)
-                    tensor = jnp.array(tensor)
-                    if tensor.dtype is torch.float16 or tensor.dtype is torch.float32:
-                        tensor = tensor.bfloat16()
-                    network.state["params"][spec["module"]][spec["param"]] = move_xmap(
-                        tensor,
-                        np.empty(params["cores_per_replica"]),
-                    )
-
-                    koboldai_vars.loaded_layers += 1
-                    try:
-                        time.sleep(0.01)
-                    except:
-                        pass
+                # Some model weights are used by transformers but not by MTJ.
+                # We have to materialize these weights anyways because
+                # transformers will throw a tantrum otherwise. To attain
+                # the least possible memory usage, we create them as meta
+                # tensors, which don't take up any actual CPU or TPU memory.
+                if model_spec_key is None:
+                    model_dict[key] = torch.empty(model_dict[key].shape, dtype=model_dict[key].dtype, device="meta")
                     utils.bar.update(1)
+                    koboldai_vars.loaded_layers += 1
+                    continue
 
-                if utils.num_shards is not None and utils.current_shard < utils.num_shards:
-                    return
+                spec = model_spec[model_spec_key]
+                transforms = set(spec.get("transforms", ()))
 
-                # Check for tensors that MTJ needs that were not provided in the
-                # HF model
-                for mk, mv in network.state["params"].items():
-                    for pk, pv in mv.items():
-                        if isinstance(pv, PlaceholderTensor):
-                            # The transformers GPT-J models apparently do not
-                            # have embedding bias, whereas MTJ GPT-J models do,
-                            # so we have to supplement an embedding bias tensor
-                            # by creating a tensor with the necessary shape, filled
-                            # with zeros.
-                            if mk == "causal_transformer_shard/~/embedding_shard/~/linear" and pk == "b":
-                                mv[pk] = move_xmap(jnp.zeros(mv[pk].shape, dtype=jnp.bfloat16), np.empty(params["cores_per_replica"]))
+                if not isinstance(model_dict[key], lazy_loader.LazyTensor):
+                    error = f"Duplicate key {repr(key)}"
+                    print("\n\nERROR: " + error, file=sys.stderr)
+                    raise RuntimeError(error)
 
-                            else:
-                                error = f"{mk} {pk} could not be found in the model checkpoint"
-                                print("\n\nERROR: " + error, file=sys.stderr)
-                                raise RuntimeError(error)
-            finally:
-                if utils.num_shards is None or utils.current_shard >= utils.num_shards:
-                    utils.bar.close()
-                    utils.bar = None
-                    koboldai_vars.status_message = ""
-                callback.nested = False
-                if isinstance(f, zipfile.ZipExtFile):
-                    f.close()
+                tensor = model_dict[key].materialize(map_location="cpu")
+                model_dict[key] = tensor.to("meta")
+
+                # MTJ requires certain mathematical operations to be performed
+                # on tensors in order for them to be in the correct format
+                if "remove_first_two_rows" in transforms:
+                    tensor = tensor[2:]
+                if "divide_by_shards" in transforms:
+                    tensor /= params["cores_per_replica"]
+                if "vocab_pad" in transforms:
+                    tensor = torch.nn.functional.pad(tensor, (0,) * (tensor.ndim * 2 - 1) + (params["n_vocab_padding"],))
+                # We don't need to transpose linear module weights anymore because MTJ will do it for us if `transposed_linear` is set to True in the config
+                #if "no_transpose" not in transforms and tensor.ndim == 2:
+                #    tensor = tensor.T
+                tensor.unsqueeze_(0)
+
+
+                # Shard the tensor so that parts of the tensor can be used
+                # on different TPU cores
+                tensor = reshard_reverse(
+                    tensor,
+                    params["cores_per_replica"],
+                    network.state["params"][spec["module"]][spec["param"]].shape,
+                )
+                tensor = tensor.detach()
+                # numpy does not support bfloat16
+                if tensor.dtype is torch.bfloat16:
+                    tensor = tensor.to(torch.float32)
+                tensor = jnp.array(tensor)
+                if tensor.dtype is torch.float16 or tensor.dtype is torch.float32:
+                    tensor = tensor.bfloat16()
+                network.state["params"][spec["module"]][spec["param"]] = move_xmap(
+                    tensor,
+                    np.empty(params["cores_per_replica"]),
+                )
+
+                koboldai_vars.loaded_layers += 1
+                try:
+                    time.sleep(0.01)
+                except:
+                    pass
+                utils.bar.update(1)
+
+            if utils.num_shards is not None and utils.current_shard < utils.num_shards:
+                return
+
+            # Check for tensors that MTJ needs that were not provided in the
+            # HF model
+            for mk, mv in network.state["params"].items():
+                for pk, pv in mv.items():
+                    if isinstance(pv, PlaceholderTensor):
+                        # The transformers GPT-J models apparently do not
+                        # have embedding bias, whereas MTJ GPT-J models do,
+                        # so we have to supplement an embedding bias tensor
+                        # by creating a tensor with the necessary shape, filled
+                        # with zeros.
+                        if mk == "causal_transformer_shard/~/embedding_shard/~/linear" and pk == "b":
+                            mv[pk] = move_xmap(jnp.zeros(mv[pk].shape, dtype=jnp.bfloat16), np.empty(params["cores_per_replica"]))
+
+                        else:
+                            error = f"{mk} {pk} could not be found in the model checkpoint"
+                            print("\n\nERROR: " + error, file=sys.stderr)
+                            raise RuntimeError(error)
+        finally:
+            if utils.num_shards is None or utils.current_shard >= utils.num_shards:
+                utils.bar.close()
+                utils.bar = None
+                koboldai_vars.status_message = ""
+            callback.nested = False
     callback.nested = False
 
     if os.path.isdir(koboldai_vars.model.replace('/', '_')):