mirror of https://github.com/KoboldAI/KoboldAI-Client.git
Basic breakmodel ui support
Seems to work
@@ -59,8 +59,6 @@ class model_backend(HFTorchInferenceModel):
         # Also, lazy loader doesn't support GPT-2 models
         self.lazy_load = False
 
-        # If we're using torch_lazy_loader, we need to get breakmodel config
-        # early so that it knows where to load the individual model tensors
         logger.debug(
             "lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(
                 self.lazy_load,
@@ -70,6 +68,16 @@ class model_backend(HFTorchInferenceModel):
             )
         )
 
+        # If we're using torch_lazy_loader, we need to get breakmodel config
+        # early so that it knows where to load the individual model tensors
+        if (
+            self.lazy_load
+            and utils.koboldai_vars.hascuda
+            and utils.koboldai_vars.breakmodel
+            and not utils.koboldai_vars.nobreakmodel
+        ):
+            self.breakmodel_device_config(self.model_config)
+
         if self.lazy_load:
             # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
             tf_kwargs.pop("low_cpu_mem_usage", None)
@@ -1,17 +1,13 @@
 from __future__ import annotations
+from dataclasses import dataclass
 
-import gc
 import os
 import time
 import bisect
-import zipfile
-import functools
 import itertools
 import traceback
 import contextlib
-from accelerate.big_modeling import load_checkpoint_and_dispatch
-from accelerate.utils.modeling import infer_auto_device_map, load_checkpoint_in_model
-from tqdm.auto import tqdm
+from torch import nn
 from typing import Dict, List, Optional, Union
 
 import torch
@@ -41,17 +37,36 @@ from modeling.inference_model import (
     use_core_manipulations,
 )
 
-try:
-    import accelerate.utils
-except ModuleNotFoundError as e:
-    if not utils.koboldai_vars.use_colab_tpu:
-        raise e
 
 # When set to true, messages will appear in the console if samplers are not
 # changing the scores. Keep in mind some samplers don't always change the
 # scores for each token.
 LOG_SAMPLER_NO_EFFECT = False
 
+
+class BreakmodelConfig:
+    def __init__(self) -> None:
+        self.disk_blocks = 0
+        self.gpu_blocks = []
+        self.primary_device = 0 if torch.cuda.device_count() > 0 else "cpu"
+
+    def get_device_map(self, model: nn.Module) -> dict:
+        ram_blocks = len(utils.layers_module_names) - sum(self.gpu_blocks)
+        cumulative_gpu_blocks = tuple(itertools.accumulate(self.gpu_blocks))
+        device_map = {}
+
+        for name in utils.layers_module_names:
+            layer = int(name.rsplit(".", 1)[1])
+            device = (
+                ("disk" if layer < self.disk_blocks else "cpu")
+                if layer < ram_blocks
+                else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
+            )
+            device_map[name] = device
+
+        for name in utils.get_missing_module_names(model, list(device_map.keys())):
+            device_map[name] = self.primary_device
+
+        return device_map
+
+
 class HFTorchInferenceModel(HFInferenceModel):
     def __init__(self) -> None:
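The interesting piece of this hunk is get_device_map: layers below ram_blocks go to disk or CPU, and the remaining layers are assigned to GPUs by bisecting the cumulative per-GPU block counts. A standalone sketch of that assignment rule, with hypothetical module names and block counts standing in for utils.layers_module_names and the UI-supplied values:

    import bisect
    import itertools

    # Hypothetical 6-layer model: 1 layer on disk, 2 on CPU, 2 on cuda:0, 1 on cuda:1.
    layer_names = [f"transformer.h.{i}" for i in range(6)]
    disk_blocks = 1
    gpu_blocks = [2, 1]

    ram_blocks = len(layer_names) - sum(gpu_blocks)       # 3 layers stay off-GPU
    cumulative = tuple(itertools.accumulate(gpu_blocks))  # (2, 3)

    device_map = {}
    for name in layer_names:
        layer = int(name.rsplit(".", 1)[1])
        device_map[name] = (
            ("disk" if layer < disk_blocks else "cpu")
            if layer < ram_blocks
            else bisect.bisect_right(cumulative, layer - ram_blocks)
        )

    print(device_map)
    # {'transformer.h.0': 'disk', 'transformer.h.1': 'cpu', 'transformer.h.2': 'cpu',
    #  'transformer.h.3': 0, 'transformer.h.4': 0, 'transformer.h.5': 1}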
@@ -80,6 +95,16 @@ class HFTorchInferenceModel(HFInferenceModel):
             post_token_probs=True,
         )
         self._old_stopping_criteria = None
+        self.breakmodel_config = BreakmodelConfig()
+
+    def set_input_parameters(self, parameters):
+        ret = super().set_input_parameters(parameters)
+
+        # Hook onto input param setting for setting breakmodel stuff
+        self.breakmodel_config.gpu_blocks = self.layers
+        self.breakmodel_config.disk_blocks = self.disk_layers
+
+        return ret
 
     def _apply_warpers(
         self, scores: torch.Tensor, input_ids: torch.Tensor
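This override is the "ui support" of the commit title: whenever the frontend pushes new input parameters, the per-GPU and disk layer counts are mirrored into the BreakmodelConfig. A minimal mock of the hook pattern (the classes and parameter keys here are illustrative, not KoboldAI's real ones; the actual code assumes the base class populates self.layers and self.disk_layers):

    class Base:
        def set_input_parameters(self, parameters):
            # stand-in for the real parsing of the UI payload
            self.layers = parameters.get("layers", [])
            self.disk_layers = parameters.get("disk_layers", 0)

    class Config:
        def __init__(self):
            self.gpu_blocks = []
            self.disk_blocks = 0

    class Backend(Base):
        def __init__(self):
            self.breakmodel_config = Config()

        def set_input_parameters(self, parameters):
            ret = super().set_input_parameters(parameters)
            # mirror the UI's layer split into the breakmodel config
            self.breakmodel_config.gpu_blocks = self.layers
            self.breakmodel_config.disk_blocks = self.disk_layers
            return ret

    backend = Backend()
    backend.set_input_parameters({"layers": [8, -1], "disk_layers": 0})
    print(backend.breakmodel_config.gpu_blocks)  # [8, -1]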
@@ -278,17 +303,20 @@ class HFTorchInferenceModel(HFInferenceModel):
 
         # Try to determine model type from either AutoModel or falling back to legacy
         try:
-            print("[HUGE SKELETON] LOADING FROM PRETRAINED")
+            with lazy_loader.use_lazy_load(dematerialized_modules=True):
+                metamodel = AutoModelForCausalLM.from_config(self.model_config)
+                device_map = self.breakmodel_config.get_device_map(metamodel)
+
             with lazy_loader.use_lazy_load(
                 enable=True,
                 # DO NOT DEMATERIALIZE MODULES / INIT WEIGHTS EMPTY!!! IT WILL EXPLODE!!!!!!!
-                # dematerialized_modules=True,
                 dematerialized_modules=False,
             ):
+                print(device_map)
                 model = AutoModelForCausalLM.from_pretrained(
                     location,
-                    device_map="auto",
-                    # max_memory={0: "10GiB", 1: "7GiB", "cpu": "20GiB"},
+                    # device_map="auto",
+                    device_map=device_map,
                     offload_folder="accelerate-disk-cache",
                     torch_dtype=torch.float16,
                     **tf_kwargs,
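The key change here: instead of letting accelerate pick a placement with device_map="auto", a dematerialized "meta" copy of the model is instantiated first, the breakmodel device map is computed from it, and that explicit map is handed to from_pretrained. Passing a hand-built map this way is standard transformers/accelerate usage; a minimal standalone sketch (model id and placement are placeholders for illustration, and any "disk" entries additionally require the offload folder):

    import torch
    from transformers import AutoModelForCausalLM

    # Hypothetical placement for a GPT-2-style model: embeddings and head on
    # cuda:0, every transformer block offloaded to CPU RAM.
    device_map = {
        "transformer.wte": 0,
        "transformer.wpe": 0,
        "transformer.drop": 0,
        "transformer.h": "cpu",
        "transformer.ln_f": 0,
        "lm_head": 0,
    }

    model = AutoModelForCausalLM.from_pretrained(
        "gpt2",  # placeholder model id
        device_map=device_map,
        offload_folder="accelerate-disk-cache",
        torch_dtype=torch.float16,
    )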
@@ -389,18 +417,19 @@ class HFTorchInferenceModel(HFInferenceModel):
             yield False
 
     def breakmodel_device_list(self, n_layers, primary=None, selected=None):
-        return
-        # TODO: Find a better place for this or rework this
-
         device_count = torch.cuda.device_count()
         if device_count < 2:
             primary = None
 
         logger.debug("n_layers: {}".format(n_layers))
-        logger.debug("gpu blocks: {}".format(breakmodel.gpu_blocks))
-        gpu_blocks = breakmodel.gpu_blocks + (
-            device_count - len(breakmodel.gpu_blocks)
+        logger.debug("gpu blocks: {}".format(self.breakmodel_config.gpu_blocks))
+        gpu_blocks = self.breakmodel_config.gpu_blocks + (
+            device_count - len(self.breakmodel_config.gpu_blocks)
         ) * [0]
 
         print(f"{Colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{Colors.END}")
 
         for i in range(device_count):
             name = torch.cuda.get_device_name(i)
             if len(name) > 47:
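Two details worth noting in this hunk: the dead "return" that previously disabled the device listing is removed, and the per-GPU list is right-padded with zeros so every detected CUDA device gets a row in the printed table. The padding is plain list arithmetic:

    gpu_blocks = [8, 20]
    device_count = 4
    gpu_blocks = gpu_blocks + (device_count - len(gpu_blocks)) * [0]
    print(gpu_blocks)  # [8, 20, 0, 0]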
@@ -410,75 +439,70 @@ class HFTorchInferenceModel(HFInferenceModel):
             print(
                 f"{row_color}{Colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{Colors.END}"
             )
 
         row_color = Colors.END
         sep_color = Colors.YELLOW
         print(
-            f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}"
+            f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {self.breakmodel_config.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}"
         )
         print(
             f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){Colors.END}"
         )
 
     def breakmodel_device_config(self, config):
-        # TODO: Find a better place for this or rework this
-        return
-
-        global breakmodel, generator
-        import breakmodel
-
         n_layers = utils.num_layers(config)
 
-        logger.debug("gpu blocks before modification: {}".format(breakmodel.gpu_blocks))
+        logger.debug("gpu blocks before modification: {}".format(self.breakmodel_config.gpu_blocks))
 
         if utils.args.cpu:
-            breakmodel.gpu_blocks = [0] * n_layers
+            self.breakmodel_config.gpu_blocks = [0] * n_layers
             return
 
-        elif breakmodel.gpu_blocks == []:
+        elif self.breakmodel_config.gpu_blocks == []:
             logger.info("Breakmodel not specified, assuming GPU 0")
-            breakmodel.gpu_blocks = [n_layers]
+            self.breakmodel_config.gpu_blocks = [n_layers]
             n_layers = 0
 
         else:
             s = n_layers
-            for i in range(len(breakmodel.gpu_blocks)):
-                if breakmodel.gpu_blocks[i] <= -1:
-                    breakmodel.gpu_blocks[i] = s
+            for i in range(len(self.breakmodel_config.gpu_blocks)):
+                if self.breakmodel_config.gpu_blocks[i] <= -1:
+                    self.breakmodel_config.gpu_blocks[i] = s
                     break
                 else:
-                    s -= breakmodel.gpu_blocks[i]
-            assert sum(breakmodel.gpu_blocks) <= n_layers
-            n_layers -= sum(breakmodel.gpu_blocks)
-            if breakmodel.disk_blocks is not None:
-                assert breakmodel.disk_blocks <= n_layers
-                n_layers -= breakmodel.disk_blocks
+                    s -= self.breakmodel_config.gpu_blocks[i]
+            assert sum(self.breakmodel_config.gpu_blocks) <= n_layers
+            n_layers -= sum(self.breakmodel_config.gpu_blocks)
+            if self.breakmodel_config.disk_blocks is not None:
+                assert self.breakmodel_config.disk_blocks <= n_layers
+                n_layers -= self.breakmodel_config.disk_blocks
 
         logger.init_ok("Final device configuration:", status="Info")
-        self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)
+        self.breakmodel_device_list(n_layers, primary=self.breakmodel_config.primary_device)
         with open(
             "settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w"
         ) as file:
             file.write(
                 "{}\n{}".format(
-                    ",".join(map(str, breakmodel.gpu_blocks)), breakmodel.disk_blocks
+                    ",".join(map(str, self.breakmodel_config.gpu_blocks)), self.breakmodel_config.disk_blocks
                 )
             )
 
         # If all layers are on the same device, use the old GPU generation mode
-        while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0:
-            breakmodel.gpu_blocks.pop()
+        while len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[-1] == 0:
+            self.breakmodel_config.gpu_blocks.pop()
         self.breakmodel = True
-        if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (
+        if len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[-1] in (
             -1,
             utils.num_layers(config),
         ):
             logger.debug("All layers on same GPU. Breakmodel disabled")
             self.breakmodel = False
             self.usegpu = True
-            utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks) - 1
+            utils.koboldai_vars.gpu_device = len(self.breakmodel_config.gpu_blocks) - 1
             return
 
-        if not breakmodel.gpu_blocks:
+        if not self.breakmodel_config.gpu_blocks:
             logger.warning("Nothing assigned to a GPU, reverting to CPU only mode")
             self.breakmodel = False
             self.usegpu = False
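The normalization in the else-branch gives a -1 entry in gpu_blocks the meaning "all layers not claimed by earlier GPUs". A worked example of that rule in isolation (hypothetical layer counts):

    def normalize_gpu_blocks(gpu_blocks, n_layers):
        # Replace the first -1 entry with however many layers remain.
        s = n_layers
        for i in range(len(gpu_blocks)):
            if gpu_blocks[i] <= -1:
                gpu_blocks[i] = s
                break
            s -= gpu_blocks[i]
        return gpu_blocks

    print(normalize_gpu_blocks([8, -1], 28))  # [8, 20] -> GPU 1 takes the rest
    print(normalize_gpu_blocks([4, 4], 28))   # [4, 4]  -> 20 layers left for CPU/disk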