From 90fd8b1845138b38d7939096e9069439f7254a95 Mon Sep 17 00:00:00 2001
From: Gnome Ann <>
Date: Mon, 20 Jun 2022 16:06:09 -0400
Subject: [PATCH] Disk cache support in CPU-only mode

---
 aiserver.py   | 25 +++++++------
 breakmodel.py | 98 +++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 111 insertions(+), 12 deletions(-)
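
Example usage of the new entry point (illustrative only, not part of the
patch): aiserver.py calls breakmodel.dispatch_model_ex() with the device map
it builds from the user's layer settings, but the sketch below shows the same
idea on a small model. The checkpoint and module names are assumptions for the
sketch (a GPT-Neo-style model whose blocks live under transformer.h); only the
dispatch_model_ex() signature and the "cpu"/"disk" device names come from the
patch itself.

    import breakmodel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "EleutherAI/gpt-neo-125M"
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Keep the embeddings, final layer norm and LM head in RAM; push every
    # transformer block out to the disk cache ("disk" is accepted in the
    # device map even though it is not a real torch.device).
    device_map = {
        "transformer.wte": "cpu",
        "transformer.wpe": "cpu",
        "transformer.ln_f": "cpu",
        "lm_head": "cpu",
    }
    device_map.update({f"transformer.h.{i}": "disk" for i in range(model.config.num_layers)})

    breakmodel.dispatch_model_ex(
        model,
        device_map,
        main_device="cpu",          # CPU-only host; this is the case the patch enables
        offload_buffers=True,
        offload_dir="accelerate-disk-cache",
    )

    input_ids = tokenizer("Hello", return_tensors="pt").input_ids
    print(tokenizer.decode(model.generate(input_ids, max_new_tokens=8)[0]))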

diff --git a/aiserver.py b/aiserver.py
index f8680cf9..583705cb 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -516,10 +516,11 @@ def device_config(config):
     import breakmodel
     n_layers = utils.num_layers(config)
     if(args.breakmodel_gpulayers is not None or (utils.HAS_ACCELERATE and args.breakmodel_disklayers is not None)):
-        if(args.breakmodel_gpulayers is None):
-            args.breakmodel_gpulayers = ",".join(["0"] * torch.cuda.device_count())
         try:
-            breakmodel.gpu_blocks = list(map(int, args.breakmodel_gpulayers.split(',')))
+            if(not args.breakmodel_gpulayers):
+                breakmodel.gpu_blocks = []
+            else:
+                breakmodel.gpu_blocks = list(map(int, args.breakmodel_gpulayers.split(',')))
             assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count()
             s = n_layers
             for i in range(len(breakmodel.gpu_blocks)):
@@ -622,7 +623,7 @@ def device_config(config):
 def move_model_to_devices(model):
     global generator

-    if(not vars.breakmodel):
+    if(not utils.HAS_ACCELERATE and not vars.breakmodel):
         if(vars.usegpu):
             model = model.half().to(vars.gpu_device)
         else:
@@ -630,11 +631,8 @@ def move_model_to_devices(model):
         generator = model.generate
         return

-    model.half()
-    gc.collect()
-
     if(utils.HAS_ACCELERATE):
-        import accelerate
+        import breakmodel
         disk_blocks = breakmodel.disk_blocks
         gpu_blocks = breakmodel.gpu_blocks
         ram_blocks = len(vars.layers_module_names) - sum(gpu_blocks)
@@ -646,11 +644,14 @@ def move_model_to_devices(model):
             device_map[name] = device
         for name in utils.get_missing_module_names(model, list(device_map.keys())):
             device_map[name] = breakmodel.primary_device
-        accelerate.dispatch_model(model, device_map, main_device=breakmodel.primary_device, offload_buffers=True, offload_dir="accelerate-disk-cache")
+        breakmodel.dispatch_model_ex(model, device_map, main_device=breakmodel.primary_device, offload_buffers=True, offload_dir="accelerate-disk-cache")
         gc.collect()
         generator = model.generate
         return

+    model.half()
+    gc.collect()
+
     if(hasattr(model, "transformer")):
         model.transformer.wte.to(breakmodel.primary_device)
         model.transformer.ln_f.to(breakmodel.primary_device)
@@ -1874,7 +1875,7 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal

     # If we're using torch_lazy_loader, we need to get breakmodel config
     # early so that it knows where to load the individual model tensors
-    if(vars.lazy_load and vars.hascuda and vars.breakmodel):
+    if(utils.HAS_ACCELERATE or vars.lazy_load and vars.hascuda and vars.breakmodel):
         device_config(model_config)

     # Download model from Huggingface if it does not exist, otherwise load locally
@@ -2003,6 +2004,10 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
                         if(not vars.lazy_load):
                             device_config(model.config)
                         move_model_to_devices(model)
+                    elif(utils.HAS_ACCELERATE):
+                        move_model_to_devices(model)
+                        vars.modeldim = get_hidden_size_from_model(model)
+                        generator = model.generate
                     else:
                         model = model.to('cpu').float()
                         vars.modeldim = get_hidden_size_from_model(model)
diff --git a/breakmodel.py b/breakmodel.py
index e071d25c..52000335 100644
--- a/breakmodel.py
+++ b/breakmodel.py
@@ -4,7 +4,7 @@ https://github.com/arrmansa/Basic-UI-for-GPT-J-6B-with-low-vram/blob/main/GPT-J-
 The ORIGINAL version of the patch is released under the Apache License 2.0
 Copyright 2021 arrmansa
 Copyright 2021 finetuneanon
-Copyright 2018 The Hugging Face team
+Copyright 2018, 2022 The Hugging Face team


                                  Apache License
@@ -216,11 +216,13 @@ from torch import nn
 import torch.cuda.comm
 import copy
 import gc
+import os
 import sys
 import itertools
 import bisect
 import random
-from typing import Optional
+import utils
+from typing import Dict, List, Optional, Union

 from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPastAndCrossAttentions

@@ -234,6 +236,98 @@ disk_blocks = 0
 primary_device = 0 if torch.cuda.device_count() > 0 else "cpu"


+if utils.HAS_ACCELERATE:
+    from accelerate.hooks import attach_align_device_hook_on_blocks
+    from accelerate.utils import OffloadedWeightsLoader, check_device_map, extract_submodules_state_dict, offload_state_dict
+    from accelerate import dispatch_model
+
+def dispatch_model_ex(
+    model: nn.Module,
+    device_map: Dict[str, Union[str, int, torch.device]],
+    main_device: Optional[torch.device] = None,
+    state_dict: Optional[Dict[str, torch.Tensor]] = None,
+    offload_dir: Union[str, os.PathLike] = None,
+    offload_buffers: bool = False,
+    **kwargs,
+):
+    """
+    This is a modified version of
+    https://github.com/huggingface/accelerate/blob/eeaba598f455fbd2c48661d7e816d3ff25ab050b/src/accelerate/big_modeling.py#L130
+    that still works when the main device is the CPU.
+
+    Dispatches a model according to a given device map. Layers of the model might be spread across GPUs, offloaded on
+    the CPU or even the disk.
+
+    Args:
+        model (`torch.nn.Module`):
+            The model to dispatch.
+        device_map (`Dict[str, Union[str, int, torch.device]]`):
+            A dictionary mapping module names in the model's `state_dict` to the device they should go to. Note that
+            `"disk"` is accepted even if it's not a proper value for `torch.device`.
+        main_device (`str`, `int` or `torch.device`, *optional*):
+            The main execution device. Will default to the first device in the `device_map` different from `"cpu"` or
+            `"disk"`.
+        state_dict (`Dict[str, torch.Tensor]`, *optional*):
+            The state dict of the part of the model that will be kept on CPU.
+        offload_dir (`str` or `os.PathLike`, *optional*):
+            The folder in which to offload the model weights (or where the model weights are already offloaded).
+        offload_buffers (`bool`, *optional*, defaults to `False`):
+            Whether or not to offload the buffers with the model parameters.
+        preload_module_classes (`List[str]`, *optional*, forwarded via `**kwargs`):
+            A list of classes whose instances should load all their weights (even in the submodules) at the beginning
+            of the forward. This should only be used for classes that have submodules which are registered but not
+            called directly during the forward, for instance if a `dense` linear layer is registered, but at forward,
+            `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly.
+    """
+    if main_device != "cpu":
+        return dispatch_model(model, device_map, main_device, state_dict, offload_dir=offload_dir, offload_buffers=offload_buffers, **kwargs)
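+
+    # From this point on the CPU is the main execution device, which the
+    # stock accelerate.dispatch_model does not handle: modules mapped to
+    # "cpu" stay regular execution modules, and only "disk" is treated as an
+    # offload device.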
+
+    # Error early if the device map is incomplete.
+    check_device_map(model, device_map)
+
+    offload_devices = ["cpu", "disk"] if main_device != "cpu" else ["disk"]
+
+    if main_device is None:
+        main_device = [d for d in device_map.values() if d not in offload_devices][0]
+
+    cpu_modules = [name for name, device in device_map.items() if device == "cpu"] if main_device != "cpu" else []
+    if state_dict is None and len(cpu_modules) > 0:
+        state_dict = extract_submodules_state_dict(model.state_dict(), cpu_modules)
+
+    disk_modules = [name for name, device in device_map.items() if device == "disk"]
+    if offload_dir is None and len(disk_modules) > 0:
+        raise ValueError(
+            "We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules "
+            f"need to be offloaded: {', '.join(disk_modules)}."
+        )
+    if len(disk_modules) > 0 and (
+        not os.path.isdir(offload_dir) or not os.path.isfile(os.path.join(offload_dir, "index.json"))
+    ):
+        disk_state_dict = extract_submodules_state_dict(model.state_dict(), disk_modules)
+        offload_state_dict(offload_dir, disk_state_dict)
+
+    execution_device = {
+        name: main_device if device in offload_devices else device for name, device in device_map.items()
+    }
+    offload = {name: device in offload_devices for name, device in device_map.items()}
+    save_folder = offload_dir if len(disk_modules) > 0 else None
+    if state_dict is not None or save_folder is not None:
+        weights_map = OffloadedWeightsLoader(state_dict=state_dict, save_folder=save_folder)
+    else:
+        weights_map = None
+
+    attach_align_device_hook_on_blocks(
+        model,
+        execution_device=execution_device,
+        offload=offload,
+        offload_buffers=offload_buffers,
+        weights_map=weights_map,
+        **kwargs,
+    )
+    model.hf_device_map = device_map
+    return model
+
+
 # Copied from transformers.models.bart.modeling_bart._expand_mask
 def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
     """