Mirror of https://github.com/KoboldAI/KoboldAI-Client.git
Remove unused file open
@@ -1171,125 +1171,118 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword
The change drops the zipfile handle that was opened but never used, together with the bookkeeping variables that existed only for it, and dedents the surviving body one level now that it no longer sits inside the `with` block. Removed:

-    with zipfile.ZipFile(f, "r") as z:

and, from the top of the old `try` block:

-            last_storage_key = None
-            zipfolder = os.path.basename(os.path.normpath(f)).split('.')[0]
-            f = None
-            current_offset = 0

and, from the end of the old `finally` block:

-            if isinstance(f, zipfile.ZipExtFile):
-                f.close()

The callback after the change:

    if callback.nested:
        return
    callback.nested = True
    try:
        if utils.current_shard == 0:
            print("\n\n\nThis model has ", f"{hk.data_structures.tree_size(network.state['params']):,d}".replace(",", " "), " parameters.\n")

        if utils.num_shards is None or utils.current_shard == 0:
            if utils.num_shards is not None:
                num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs))
            else:
                num_tensors = len(model_dict)

            if socketio is None:
                utils.bar = tqdm(total=num_tensors, desc="Loading model tensors")
            else:
                utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=utils.UIProgressBarFile(socketio.emit))
            koboldai_vars.status_message = "Loading model"
            koboldai_vars.loaded_layers = 0
            koboldai_vars.total_layers = num_tensors

        if utils.num_shards is not None:
            utils.current_shard += 1

        for key in sorted(model_dict.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)):
            model_spec_key = max((k for k in model_spec.keys() if key.endswith(k)), key=len, default=None)

            # Some model weights are used by transformers but not by MTJ.
            # We have to materialize these weights anyways because
            # transformers will throw a tantrum otherwise. To attain
            # the least possible memory usage, we create them as meta
            # tensors, which don't take up any actual CPU or TPU memory.
            if model_spec_key is None:
                model_dict[key] = torch.empty(model_dict[key].shape, dtype=model_dict[key].dtype, device="meta")
                utils.bar.update(1)
                koboldai_vars.loaded_layers += 1
                continue

            spec = model_spec[model_spec_key]
            transforms = set(spec.get("transforms", ()))

            if not isinstance(model_dict[key], lazy_loader.LazyTensor):
                error = f"Duplicate key {repr(key)}"
                print("\n\nERROR: " + error, file=sys.stderr)
                raise RuntimeError(error)

            tensor = model_dict[key].materialize(map_location="cpu")
            model_dict[key] = tensor.to("meta")

            # MTJ requires certain mathematical operations to be performed
            # on tensors in order for them to be in the correct format
            if "remove_first_two_rows" in transforms:
                tensor = tensor[2:]
            if "divide_by_shards" in transforms:
                tensor /= params["cores_per_replica"]
            if "vocab_pad" in transforms:
                tensor = torch.nn.functional.pad(tensor, (0,) * (tensor.ndim * 2 - 1) + (params["n_vocab_padding"],))
            # We don't need to transpose linear module weights anymore because MTJ will do it for us if `transposed_linear` is set to True in the config
            #if "no_transpose" not in transforms and tensor.ndim == 2:
            #    tensor = tensor.T
            tensor.unsqueeze_(0)

            # Shard the tensor so that parts of the tensor can be used
            # on different TPU cores
            tensor = reshard_reverse(
                tensor,
                params["cores_per_replica"],
                network.state["params"][spec["module"]][spec["param"]].shape,
            )
            tensor = tensor.detach()
            # numpy does not support bfloat16
            if tensor.dtype is torch.bfloat16:
                tensor = tensor.to(torch.float32)
            tensor = jnp.array(tensor)
            if tensor.dtype is torch.float16 or tensor.dtype is torch.float32:
                tensor = tensor.bfloat16()
            network.state["params"][spec["module"]][spec["param"]] = move_xmap(
                tensor,
                np.empty(params["cores_per_replica"]),
            )

            koboldai_vars.loaded_layers += 1
            try:
                time.sleep(0.01)
            except:
                pass
            utils.bar.update(1)

        if utils.num_shards is not None and utils.current_shard < utils.num_shards:
            return

        # Check for tensors that MTJ needs that were not provided in the
        # HF model
        for mk, mv in network.state["params"].items():
            for pk, pv in mv.items():
                if isinstance(pv, PlaceholderTensor):
                    # The transformers GPT-J models apparently do not
                    # have embedding bias, whereas MTJ GPT-J models do,
                    # so we have to supplement an embedding bias tensor
                    # by creating a tensor with the necessary shape, filled
                    # with zeros.
                    if mk == "causal_transformer_shard/~/embedding_shard/~/linear" and pk == "b":
                        mv[pk] = move_xmap(jnp.zeros(mv[pk].shape, dtype=jnp.bfloat16), np.empty(params["cores_per_replica"]))

                    else:
                        error = f"{mk} {pk} could not be found in the model checkpoint"
                        print("\n\nERROR: " + error, file=sys.stderr)
                        raise RuntimeError(error)
    finally:
        if utils.num_shards is None or utils.current_shard >= utils.num_shards:
            utils.bar.close()
            utils.bar = None
            koboldai_vars.status_message = ""
        callback.nested = False
callback.nested = False

if os.path.isdir(koboldai_vars.model.replace('/', '_')):
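The hunk above leans on a few PyTorch/JAX idioms that are easy to miss. The first is the `device="meta"` trick used for weights that transformers insists on seeing but MTJ never reads: a meta tensor carries only shape and dtype, so it costs no CPU or TPU memory. A minimal sketch (the shape below is an arbitrary illustration, not taken from the model):

    import torch

    # A meta tensor records only shape and dtype; no storage is allocated,
    # so a weight that merely has to exist can be represented for free.
    w = torch.empty((50400, 4096), dtype=torch.float16, device="meta")
    print(w.shape, w.dtype, w.device)   # torch.Size([50400, 4096]) torch.float16 meta
    print(w.is_meta)                    # True -- there is no data behind it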
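The `vocab_pad` transform relies on how `torch.nn.functional.pad` reads its pad tuple: from the last dimension backwards, two entries (before, after) per dimension. `(0,) * (tensor.ndim * 2 - 1) + (pad,)` therefore leaves every axis untouched except the first one, which gets `pad` zero rows appended, i.e. extra vocab rows on an embedding matrix. A small sketch with made-up sizes:

    import torch
    import torch.nn.functional as F

    emb = torch.randn(50257, 16)   # illustrative vocab x hidden shape
    pad = 143                      # illustrative padding amount
    padded = F.pad(emb, (0,) * (emb.ndim * 2 - 1) + (pad,))
    assert padded.shape == (50257 + pad, 16)
    assert torch.equal(padded[-pad:], torch.zeros(pad, 16))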
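The dtype handling before `move_xmap` exists because numpy, which bridges torch tensors into `jnp.array`, has no bfloat16: a bfloat16 checkpoint tensor is upcast to float32 on the CPU side before crossing into JAX, and the downcast back to bfloat16 can then happen on the JAX side. A hedged sketch of that round trip (the helper name is mine, not the repo's):

    import torch
    import jax.numpy as jnp

    def torch_to_bf16_jax(t: torch.Tensor) -> jnp.ndarray:
        # numpy cannot represent bfloat16, so detour through float32 and
        # restore bfloat16 once the data is a JAX array.
        t = t.detach()
        if t.dtype is torch.bfloat16:
            t = t.to(torch.float32)
        return jnp.array(t.numpy()).astype(jnp.bfloat16)

    x = torch.randn(4, 4, dtype=torch.bfloat16)
    print(torch_to_bf16_jax(x).dtype)   # bfloat16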