Mirror of https://github.com/KoboldAI/KoboldAI-Client.git (synced 2025-06-05 21:59:24 +02:00)

Commit: Just use accelerate on tpu
torch_lazy_loader.py:

@@ -56,6 +56,7 @@ import collections
 import _codecs
 import os
 from typing import Any, Callable, Dict, Optional, Tuple, Type
+import accelerate

 from torch.nn import Module
 from torch.storage import UntypedStorage
@@ -64,6 +65,7 @@ from torch.storage import UntypedStorage
 # support it yet.
 try:
     import safetensors
+
     HAS_SAFETENSORS = True
 except ModuleNotFoundError:
     HAS_SAFETENSORS = False
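The try/except above sets a module-level flag instead of failing when safetensors is absent. A minimal sketch of how such a flag is typically consumed later in the loader; the function name is illustrative, not from this commit:

    import torch

    def load_weights(path: str):
        # Assumes the HAS_SAFETENSORS flag and guarded import from the
        # hunk above; prefer safetensors only when it imported cleanly.
        if HAS_SAFETENSORS and path.endswith(".safetensors"):
            import safetensors.torch
            return safetensors.torch.load_file(path, device="cpu")
        return torch.load(path, map_location="cpu")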
@@ -71,17 +73,6 @@ except ModuleNotFoundError:
 import utils
 from logger import logger

-# Accelerate is used to load with empty modules. TPU version doesn't come
-# packaged with it so we use an in-house solution in that case
-try:
-    import accelerate
-    HAS_ACCELERATE = True
-except ModuleNotFoundError:
-    HAS_ACCELERATE = False
-
-_EXTRA_STATE_KEY_SUFFIX = "_extra_state"
-
-
 # Storage of zipfile handles for each shard
 torch_checkpoint_file_handles = {}

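With accelerate now imported unconditionally at the top of the file (and pinned in requirements, see the last hunk), the HAS_ACCELERATE guard and the in-house fallback it selected become dead code and are deleted. A minimal sketch of what accelerate.init_empty_weights() provides; the module and its shape are illustrative:

    import accelerate
    import torch.nn as nn

    with accelerate.init_empty_weights():
        # Modules are instantiated on the "meta" device: parameters carry
        # shape and dtype metadata but allocate no real storage.
        model = nn.Linear(4096, 4096)

    assert model.weight.is_meta  # no memory was allocated for the weight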
@@ -331,116 +322,6 @@ def _rebuild_tensor(lazy_storage: LazyTensor, storage_offset, shape, stride):
     )
     return lazy_storage

-# Modified version of https://github.com/pytorch/pytorch/blob/v1.11.0-rc4/torch/nn/modules/module.py#L1346-L1438
-def _load_from_state_dict(
-    self,
-    state_dict,
-    prefix,
-    local_metadata,
-    strict,
-    missing_keys,
-    unexpected_keys,
-    error_msgs,
-):
-    for hook in self._load_state_dict_pre_hooks.values():
-        hook(
-            state_dict,
-            prefix,
-            local_metadata,
-            strict,
-            missing_keys,
-            unexpected_keys,
-            error_msgs,
-        )
-
-    persistent_buffers = {
-        k: v
-        for k, v in self._buffers.items()
-        if k not in self._non_persistent_buffers_set
-    }
-    local_name_params = itertools.chain(
-        self._parameters.items(), persistent_buffers.items()
-    )
-    local_state = {k: v for k, v in local_name_params if v is not None}
-
-    for name, param in local_state.items():
-        key = prefix + name
-        if key in state_dict:
-            input_param = state_dict[key]
-            if not torch.overrides.is_tensor_like(input_param):
-                error_msgs.append(
-                    'While copying the parameter named "{}", '
-                    "expected torch.Tensor or Tensor-like object from checkpoint but "
-                    "received {}".format(key, type(input_param))
-                )
-                continue
-
-            # This is used to avoid copying uninitialized parameters into
-            # non-lazy modules, since they dont have the hook to do the checks
-            # in such case, it will error when accessing the .shape attribute.
-            is_param_lazy = torch.nn.parameter.is_lazy(param)
-            # Backward compatibility: loading 1-dim tensor from 0.3.* to version 0.4+
-            if (
-                not is_param_lazy
-                and len(param.shape) == 0
-                and len(input_param.shape) == 1
-            ):
-                input_param = input_param[0]
-
-            if not is_param_lazy and input_param.shape != param.shape:
-                # local shape should match the one in checkpoint
-                error_msgs.append(
-                    "size mismatch for {}: copying a param with shape {} from checkpoint, "
-                    "the shape in current model is {}.".format(
-                        key, input_param.shape, param.shape
-                    )
-                )
-                continue
-            try:
-                with torch.no_grad():
-                    # param.copy_(input_param)
-                    new_param = torch.nn.Parameter(
-                        input_param, requires_grad=param.requires_grad
-                    )  # This line is new
-                    if name in self._parameters:  # This line is new
-                        self._parameters[name] = new_param  # This line is new
-                    if name in persistent_buffers:  # This line is new
-                        self._buffers[name] = new_param  # This line is new
-            except Exception as ex:
-                error_msgs.append(
-                    'While copying the parameter named "{}", '
-                    "whose dimensions in the model are {} and "
-                    "whose dimensions in the checkpoint are {}, "
-                    "an exception occurred : {}.".format(
-                        key, param.size(), input_param.size(), ex.args
-                    )
-                )
-        elif strict:
-            missing_keys.append(key)
-
-    extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX
-    if (
-        hasattr(Module, "set_extra_state")
-        and getattr(self.__class__, "set_extra_state", Module.set_extra_state)
-        is not Module.set_extra_state
-    ):  # if getattr(self.__class__, "set_extra_state", Module.set_extra_state) is not Module.set_extra_state:
-        if extra_state_key in state_dict:
-            self.set_extra_state(state_dict[extra_state_key])
-        elif strict:
-            missing_keys.append(extra_state_key)
-    elif strict and (extra_state_key in state_dict):
-        unexpected_keys.append(extra_state_key)
-
-    if strict:
-        for key in state_dict.keys():
-            if key.startswith(prefix) and key != extra_state_key:
-                input_name = key[len(prefix) :]
-                input_name = input_name.split(".", 1)[
-                    0
-                ]  # get the name of param/buffer/child
-                if input_name not in self._modules and input_name not in local_state:
-                    unexpected_keys.append(key)
-

 def safetensors_load_tensor_independently(
     checkpoint_file: str, tensor_key: str, device: Any
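The deleted override diverged from upstream PyTorch only in the lines marked "# This line is new": instead of param.copy_(input_param), it replaced each meta-device parameter outright, because a storage-less meta tensor cannot receive copied data. A minimal sketch of that replacement technique; the layer and tensor here are illustrative:

    import torch

    layer = torch.nn.Linear(4, 4, device="meta")  # dematerialized module
    checkpoint_tensor = torch.randn(4, 4)         # stands in for a loaded weight

    with torch.no_grad():
        # Swap in a fresh Parameter built from the checkpoint tensor,
        # mirroring what the removed _load_from_state_dict did.
        layer._parameters["weight"] = torch.nn.Parameter(
            checkpoint_tensor, requires_grad=layer.weight.requires_grad
        )

    print(layer.weight.device)  # cpu: the weight is now materialized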
@@ -572,29 +453,8 @@ def use_lazy_load(
         patch_safetensors(callback)

     if dematerialized_modules:
-        if HAS_ACCELERATE:
-            init_empty_weights = accelerate.init_empty_weights()
-            init_empty_weights.__enter__()
-        else:
-            # TPU doesn't use accelerate package
-            old_linear_init = torch.nn.Linear.__init__
-            old_embedding_init = torch.nn.Embedding.__init__
-            old_layernorm_init = torch.nn.LayerNorm.__init__
-
-            def linear_init(self, *args, device=None, **kwargs):
-                return old_linear_init(self, *args, device="meta", **kwargs)
-
-            def embedding_init(self, *args, device=None, **kwargs):
-                return old_embedding_init(self, *args, device="meta", **kwargs)
-
-            def layernorm_init(self, *args, device=None, **kwargs):
-                return old_layernorm_init(self, *args, device="meta", **kwargs)
-
-            torch.nn.Linear.__init__ = linear_init
-            torch.nn.Embedding.__init__ = embedding_init
-            torch.nn.LayerNorm.__init__ = layernorm_init
-            old_load_from_state_dict = torch.nn.Module._load_from_state_dict
-            torch.nn.Module._load_from_state_dict = _load_from_state_dict
+        init_empty_weights = accelerate.init_empty_weights()
+        init_empty_weights.__enter__()

     with use_custom_unpickler(_LazyUnpickler):
         yield True
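Because use_lazy_load is a generator-based context manager, the accelerate context must stay open across the yield; calling __enter__ in the setup half here and __exit__ in the teardown half below is the split form of a with statement. A sketch of the equivalence, assuming nothing beyond what the hunk shows:

    import accelerate

    # What the two halves of use_lazy_load do together...
    ctx = accelerate.init_empty_weights()
    ctx.__enter__()
    try:
        pass  # ...the caller's model construction happens here...
    finally:
        ctx.__exit__(None, None, None)

    # ...is equivalent to:
    with accelerate.init_empty_weights():
        pass  # ...the caller's model construction happens here...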
@@ -609,13 +469,7 @@ def use_lazy_load(
         )

     if dematerialized_modules:
-        if HAS_ACCELERATE:
-            init_empty_weights.__exit__(None, None, None)
-        else:
-            torch.nn.Linear.__init__ = old_linear_init
-            torch.nn.Embedding.__init__ = old_embedding_init
-            torch.nn.LayerNorm.__init__ = old_layernorm_init
-            torch.nn.Module._load_from_state_dict = old_load_from_state_dict
+        init_empty_weights.__exit__(None, None, None)


 def post_load_cleanup() -> None:
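Taken together, the two hunks leave a symmetric setup/teardown pair around the caller's work. A hypothetical call site, assuming the module is imported as torch_lazy_loader and using a placeholder checkpoint path:

    import torch
    import torch_lazy_loader

    with torch_lazy_loader.use_lazy_load(dematerialized_modules=True):
        # torch.load now goes through _LazyUnpickler, and any modules
        # constructed here land on the meta device via accelerate.
        state_dict = torch.load("pytorch_model.bin", map_location="cpu")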
requirements.txt:

@@ -34,3 +34,4 @@ ijson
 ftfy
 pydub
 sentencepiece
+accelerate==0.18.0
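Finally, accelerate is pinned so the unconditional import added at the top of torch_lazy_loader.py is guaranteed to resolve. A quick sanity check one could run after installing; the expected version string simply restates the pin above:

    import accelerate

    # With the pin in requirements.txt, a missing or mismatched install
    # now fails fast instead of silently taking the removed TPU fallback.
    print(accelerate.__version__)  # expected: "0.18.0"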