From e28e268a2d413cdb64ff5b970881ca27fa24d887 Mon Sep 17 00:00:00 2001
From: onesome
Date: Tue, 25 Apr 2023 18:32:37 -0500
Subject: [PATCH] Use safetensors only when available

---
 modeling/lazy_loader.py | 98 +++++++++++++++++++++++------------------
 1 file changed, 55 insertions(+), 43 deletions(-)

diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py
index 7ce94819..36a32cea 100644
--- a/modeling/lazy_loader.py
+++ b/modeling/lazy_loader.py
@@ -54,10 +54,17 @@ import numpy as np
 import collections
 import _codecs
 import os
-import safetensors
 from torch.nn import Module
 from typing import Any, Callable, Dict, Optional, Tuple, Type, Union
 
+# Safetensors is a dependency for the local version; the TPU/Colab
+# backend doesn't support it yet.
+try:
+    import safetensors
+    HAS_SAFETENSORS = True
+except ModuleNotFoundError:
+    HAS_SAFETENSORS = False
+
 import utils
 
 
@@ -382,6 +389,51 @@ def safetensors_load_tensor_independently(
     return f.get_tensor(tensor_key)
 
 
+def patch_safetensors(callback):
+    # Safetensors load patch
+    import transformers
+
+    def safetensors_load(checkpoint_file: str) -> dict:
+        # Monkeypatch applied to safetensors.torch.load_file
+
+        if utils.koboldai_vars.hascuda:
+            # Use the GPU as an intermediary whenever possible; it lowers RAM
+            # usage by a significant amount while making loading slightly
+            # slower (70 tensors/s -> 65 tensors/s). The memory savings
+            # probably shouldn't be happening; maybe there's a memory leak
+            # somewhere in our pipeline with CPU tensors.
+            intermediary_device = "cuda"
+        else:
+            intermediary_device = "cpu"
+
+        tensors = {}
+
+        with safetensors.safe_open(
+            checkpoint_file, framework="pt", device=intermediary_device,
+        ) as f:
+            for key in f.keys():
+                tensors[key] = None
+
+        for key in tensors.keys():
+
+            tensors[key] = SafetensorsLazyTensor(
+                checkpoint_file=checkpoint_file, key=key, location=intermediary_device,
+            )
+
+        if callback is not None:
+            callback(
+                tensors,
+                f=checkpoint_file,
+                map_location=None,
+                pickle_module=pickle,
+                is_safetensors=True,
+            )
+
+        return tensors
+
+    transformers.modeling_utils.safe_load_file = safetensors_load
+
+
 @contextlib.contextmanager
 def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler):
     try:
@@ -441,48 +493,8 @@ def use_lazy_load(
 
         torch.load = torch_load
 
-        # Safetensors load patch
-        import transformers
-
-        def safetensors_load(checkpoint_file: str) -> dict:
-            # Monkeypatch applied to safetensors.torch.load_file
-
-            if utils.koboldai_vars.hascuda:
-                # Use GPU as intermediary whenever possible, lowers RAM usage
-                # by a significant amount while making loading slightly slower
-                # (70 tensors/s -> 65 tensor/s). The memory savings probably
-                # shouldn't be the happening, maybe there's a memory leak
-                # somewhere in our pipeline with CPU tensors.
-                intermediary_device = "cuda"
-            else:
-                intermediary_device = "cpu"
-
-            tensors = {}
-
-            with safetensors.safe_open(
-                checkpoint_file, framework="pt", device=intermediary_device,
-            ) as f:
-                for key in f.keys():
-                    tensors[key] = None
-
-            for key in tensors.keys():
-
-                tensors[key] = SafetensorsLazyTensor(
-                    checkpoint_file=checkpoint_file, key=key, location=intermediary_device,
-                )
-
-            if callback is not None:
-                callback(
-                    tensors,
-                    f=checkpoint_file,
-                    map_location=None,
-                    pickle_module=pickle,
-                    is_safetensors=True,
-                )
-
-            return tensors
-
-        transformers.modeling_utils.safe_load_file = safetensors_load
+        if HAS_SAFETENSORS:
+            patch_safetensors(callback)
 
     if dematerialized_modules:
         if use_accelerate_init_empty_weights:
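
Below is a standalone sketch (not taken from the patch) of the guarded-import fallback this commit introduces: probe the optional safetensors dependency once at import time, then gate every safetensors-specific code path on the resulting flag so that environments without the package (e.g. the TPU/Colab backend) fall back to the plain torch checkpoint path. The load_checkpoint helper and its file-extension check are illustrative assumptions, not code from modeling/lazy_loader.py; only the try/except flag mirrors the hunk above.

try:
    # Optional dependency; may be absent on the TPU/Colab backend.
    import safetensors

    HAS_SAFETENSORS = True
except ModuleNotFoundError:
    HAS_SAFETENSORS = False


def load_checkpoint(path: str) -> dict:
    # Illustrative helper (not part of the real module): prefer safetensors
    # when the library is installed and the file uses its format, otherwise
    # fall back to an ordinary torch pickle checkpoint.
    import torch

    if HAS_SAFETENSORS and path.endswith(".safetensors"):
        tensors = {}
        with safetensors.safe_open(path, framework="pt", device="cpu") as f:
            for key in f.keys():
                tensors[key] = f.get_tensor(key)
        return tensors

    return torch.load(path, map_location="cpu")

With this shape, callers never need to know whether safetensors is importable; the flag is checked once per process, which matches how use_lazy_load now calls patch_safetensors only when HAS_SAFETENSORS is true.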