Mirror of https://github.com/KoboldAI/KoboldAI-Client.git
Use safetensors only when available
@@ -54,10 +54,17 @@ import numpy as np
 import collections
 import _codecs
 import os
-import safetensors
 from torch.nn import Module
 from typing import Any, Callable, Dict, Optional, Tuple, Type, Union
 
+# Safetensors is a dependency for the local version, TPU/Colab doesn't
+# support it yet.
+try:
+    import safetensors
+    HAS_SAFETENSORS = True
+except ModuleNotFoundError:
+    HAS_SAFETENSORS = False
+
 import utils
 
 
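Note: the hunk above replaces the unconditional `import safetensors` with a guarded import, so environments without the package (currently TPU/Colab) can still import the module. A minimal, self-contained sketch of the same guarded-import pattern, assuming only that torch is installed; `load_weights` is an illustrative helper, not part of the commit:

    import torch

    # Probe for the optional dependency once, at import time.
    try:
        import safetensors.torch
        HAS_SAFETENSORS = True
    except ModuleNotFoundError:
        HAS_SAFETENSORS = False

    def load_weights(path: str) -> dict:
        # Take the safetensors path only when the import succeeded;
        # otherwise fall back to the pickle-based torch format.
        if HAS_SAFETENSORS and path.endswith(".safetensors"):
            return safetensors.torch.load_file(path, device="cpu")
        return torch.load(path, map_location="cpu")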
@@ -382,6 +389,51 @@ def safetensors_load_tensor_independently(
         return f.get_tensor(tensor_key)
 
 
+def patch_safetensors(callback):
+    # Safetensors load patch
+    import transformers
+
+    def safetensors_load(checkpoint_file: str) -> dict:
+        # Monkeypatch applied to safetensors.torch.load_file
+
+        if utils.koboldai_vars.hascuda:
+            # Use GPU as intermediary whenever possible, lowers RAM usage
+            # by a significant amount while making loading slightly slower
+            # (70 tensors/s -> 65 tensors/s). The memory savings probably
+            # shouldn't be happening, maybe there's a memory leak
+            # somewhere in our pipeline with CPU tensors.
+            intermediary_device = "cuda"
+        else:
+            intermediary_device = "cpu"
+
+        tensors = {}
+
+        with safetensors.safe_open(
+            checkpoint_file, framework="pt", device=intermediary_device,
+        ) as f:
+            for key in f.keys():
+                tensors[key] = None
+
+        for key in tensors.keys():
+            tensors[key] = SafetensorsLazyTensor(
+                checkpoint_file=checkpoint_file, key=key, location=intermediary_device,
+            )
+
+        if callback is not None:
+            callback(
+                tensors,
+                f=checkpoint_file,
+                map_location=None,
+                pickle_module=pickle,
+                is_safetensors=True,
+            )
+
+        return tensors
+
+    transformers.modeling_utils.safe_load_file = safetensors_load
+
+
 @contextlib.contextmanager
 def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler):
     try:
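Note: `patch_safetensors` swaps `transformers.modeling_utils.safe_load_file` for a loader that returns lazy placeholders instead of materialized tensors: one pass over `safetensors.safe_open` reads the tensor names from the file header, then each name is mapped to a `SafetensorsLazyTensor` that only records where the data lives. A standalone sketch of that two-pass idea, assuming a local .safetensors file; `LazyRef` and `lazy_index` are illustrative stand-ins, not the commit's classes:

    import safetensors

    class LazyRef:
        # Cheap placeholder: remembers where a tensor lives, loads nothing yet.
        def __init__(self, checkpoint_file: str, key: str, device: str):
            self.checkpoint_file = checkpoint_file
            self.key = key
            self.device = device

        def materialize(self):
            # Reopen the checkpoint and pull just this one tensor.
            with safetensors.safe_open(
                self.checkpoint_file, framework="pt", device=self.device
            ) as f:
                return f.get_tensor(self.key)

    def lazy_index(checkpoint_file: str, device: str = "cpu") -> dict:
        # Pass 1: read only the header to enumerate tensor names.
        with safetensors.safe_open(
            checkpoint_file, framework="pt", device=device
        ) as f:
            keys = list(f.keys())
        # Pass 2: map every name to a placeholder, materialized on demand.
        return {key: LazyRef(checkpoint_file, key, device) for key in keys}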
@@ -441,48 +493,8 @@ def use_lazy_load(
 
         torch.load = torch_load
 
-        # Safetensors load patch
-        import transformers
-
-        def safetensors_load(checkpoint_file: str) -> dict:
-            # Monkeypatch applied to safetensors.torch.load_file
-
-            if utils.koboldai_vars.hascuda:
-                # Use GPU as intermediary whenever possible, lowers RAM usage
-                # by a significant amount while making loading slightly slower
-                # (70 tensors/s -> 65 tensors/s). The memory savings probably
-                # shouldn't be happening, maybe there's a memory leak
-                # somewhere in our pipeline with CPU tensors.
-                intermediary_device = "cuda"
-            else:
-                intermediary_device = "cpu"
-
-            tensors = {}
-
-            with safetensors.safe_open(
-                checkpoint_file, framework="pt", device=intermediary_device,
-            ) as f:
-                for key in f.keys():
-                    tensors[key] = None
-
-            for key in tensors.keys():
-                tensors[key] = SafetensorsLazyTensor(
-                    checkpoint_file=checkpoint_file, key=key, location=intermediary_device,
-                )
-
-            if callback is not None:
-                callback(
-                    tensors,
-                    f=checkpoint_file,
-                    map_location=None,
-                    pickle_module=pickle,
-                    is_safetensors=True,
-                )
-
-            return tensors
-
-        transformers.modeling_utils.safe_load_file = safetensors_load
-
+        if HAS_SAFETENSORS:
+            patch_safetensors(callback)
 
         if dematerialized_modules:
             if use_accelerate_init_empty_weights:
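Note: with the patch factored out, `use_lazy_load` installs it only when the guarded import succeeded, so environments without safetensors skip the monkeypatch entirely. The mechanics are plain attribute assignment on a module object; a toy demonstration, with `types.SimpleNamespace` standing in for `transformers.modeling_utils`:

    import types

    # Stand-in for the module whose attribute gets patched.
    toy_module = types.SimpleNamespace()

    def original_loader(path: str) -> dict:
        return {"source": "original", "path": path}

    toy_module.safe_load_file = original_loader

    def patched_loader(path: str) -> dict:
        # Installed at runtime; callers that resolve the function through
        # the module attribute pick this version up transparently.
        return {"source": "patched", "path": path}

    toy_module.safe_load_file = patched_loader

    assert toy_module.safe_load_file("model.safetensors")["source"] == "patched"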