Mirror of https://github.com/KoboldAI/KoboldAI-Client.git
Basic breakmodel ui support
Seems to work
@@ -59,8 +59,6 @@ class model_backend(HFTorchInferenceModel):
         # Also, lazy loader doesn't support GPT-2 models
         self.lazy_load = False
 
-        # If we're using torch_lazy_loader, we need to get breakmodel config
-        # early so that it knows where to load the individual model tensors
         logger.debug(
             "lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(
                 self.lazy_load,
@@ -70,6 +68,16 @@ class model_backend(HFTorchInferenceModel):
                 )
             )
 
+        # If we're using torch_lazy_loader, we need to get breakmodel config
+        # early so that it knows where to load the individual model tensors
+        if (
+            self.lazy_load
+            and utils.koboldai_vars.hascuda
+            and utils.koboldai_vars.breakmodel
+            and not utils.koboldai_vars.nobreakmodel
+        ):
+            self.breakmodel_device_config(self.model_config)
+
         if self.lazy_load:
            # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
            tf_kwargs.pop("low_cpu_mem_usage", None)
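Note: the new block computes the GPU/CPU/disk split before any weights are materialized, which is why it runs ahead of the lazy-load path. A minimal sketch of the same guard as a standalone predicate (the argument names mirror the koboldai_vars fields used above, nothing more):

    def should_configure_breakmodel(lazy_load: bool, hascuda: bool,
                                    breakmodel: bool, nobreakmodel: bool) -> bool:
        # Split layers across devices only when lazy loading on a CUDA machine
        # with breakmodel requested and not explicitly disabled.
        return lazy_load and hascuda and breakmodel and not nobreakmodel

    assert should_configure_breakmodel(True, True, True, False)
    assert not should_configure_breakmodel(True, True, True, True)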
@@ -1,17 +1,13 @@
 from __future__ import annotations
 from dataclasses import dataclass
 
 import gc
 import os
 import time
 import bisect
 import zipfile
 import functools
 import itertools
 import traceback
 import contextlib
 from accelerate.big_modeling import load_checkpoint_and_dispatch
 from accelerate.utils.modeling import infer_auto_device_map, load_checkpoint_in_model
 from tqdm.auto import tqdm
 from torch import nn
 from typing import Dict, List, Optional, Union
 
 import torch
@@ -41,17 +37,36 @@ from modeling.inference_model import (
     use_core_manipulations,
 )
 
-try:
-    import accelerate.utils
-except ModuleNotFoundError as e:
-    if not utils.koboldai_vars.use_colab_tpu:
-        raise e
-
 # When set to true, messages will appear in the console if samplers are not
 # changing the scores. Keep in mind some samplers don't always change the
 # scores for each token.
 LOG_SAMPLER_NO_EFFECT = False
 
+
+class BreakmodelConfig:
+    def __init__(self) -> None:
+        self.disk_blocks = 0
+        self.gpu_blocks = []
+        self.primary_device = 0 if torch.cuda.device_count() > 0 else "cpu"
+
+    def get_device_map(self, model: nn.Module) -> dict:
+        ram_blocks = len(utils.layers_module_names) - sum(self.gpu_blocks)
+        cumulative_gpu_blocks = tuple(itertools.accumulate(self.gpu_blocks))
+        device_map = {}
+
+        for name in utils.layers_module_names:
+            layer = int(name.rsplit(".", 1)[1])
+            device = (
+                ("disk" if layer < self.disk_blocks else "cpu")
+                if layer < ram_blocks
+                else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
+            )
+            device_map[name] = device
+
+        for name in utils.get_missing_module_names(model, list(device_map.keys())):
+            device_map[name] = self.primary_device
+
+        return device_map
+
+
 class HFTorchInferenceModel(HFInferenceModel):
     def __init__(self) -> None:
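Note: the layer-to-device arithmetic in get_device_map can be sanity-checked in isolation. A self-contained sketch with a made-up 12-layer model and hypothetical module names (none of these numbers come from the commit):

    import bisect
    import itertools

    # Hypothetical split: 2 layers on disk, 2 in CPU RAM, 5 on GPU 0, 3 on GPU 1.
    disk_blocks = 2
    gpu_blocks = [5, 3]
    layer_names = [f"transformer.h.{i}" for i in range(12)]

    ram_blocks = len(layer_names) - sum(gpu_blocks)       # 4 layers stay off the GPUs
    cumulative = tuple(itertools.accumulate(gpu_blocks))  # (5, 8)

    device_map = {}
    for name in layer_names:
        layer = int(name.rsplit(".", 1)[1])
        device_map[name] = (
            # The first disk_blocks of the off-GPU layers go to disk, the rest to CPU.
            ("disk" if layer < disk_blocks else "cpu")
            if layer < ram_blocks
            # bisect_right picks the GPU whose cumulative block range covers the layer.
            else bisect.bisect_right(cumulative, layer - ram_blocks)
        )

    # Layers 0-1 -> "disk", 2-3 -> "cpu", 4-8 -> 0, 9-11 -> 1.
    print(device_map)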
@@ -80,6 +95,16 @@ class HFTorchInferenceModel(HFInferenceModel):
             post_token_probs=True,
         )
         self._old_stopping_criteria = None
+        self.breakmodel_config = BreakmodelConfig()
+
+    def set_input_parameters(self, parameters):
+        ret = super().set_input_parameters(parameters)
+
+        # Hook onto input param setting for setting breakmodel stuff
+        self.breakmodel_config.gpu_blocks = self.layers
+        self.breakmodel_config.disk_blocks = self.disk_layers
+
+        return ret
 
     def _apply_warpers(
         self, scores: torch.Tensor, input_ids: torch.Tensor
@@ -278,17 +303,20 @@ class HFTorchInferenceModel(HFInferenceModel):
 
         # Try to determine model type from either AutoModel or falling back to legacy
         try:
             print("[HUGE SKELETON] LOADING FROM PRETRAINED")
+            with lazy_loader.use_lazy_load(dematerialized_modules=True):
+                metamodel = AutoModelForCausalLM.from_config(self.model_config)
+                device_map = self.breakmodel_config.get_device_map(metamodel)
 
             with lazy_loader.use_lazy_load(
                 enable=True,
                 # DO NOT DEMATERIALIZE MODULES / INIT WEIGHTS EMPTY!!! IT WILL EXPLODE!!!!!!!
                 # dematerialized_modules=True,
                 dematerialized_modules=False,
             ):
+                print(device_map)
                 model = AutoModelForCausalLM.from_pretrained(
                     location,
-                    device_map="auto",
-                    # max_memory={0: "10GiB", 1: "7GiB", "cpu": "20GiB"},
+                    # device_map="auto",
+                    device_map=device_map,
                     offload_folder="accelerate-disk-cache",
                     torch_dtype=torch.float16,
                     **tf_kwargs,
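Note: transformers accepts an explicit dict for device_map, mapping submodule names to an integer CUDA index, "cpu", or "disk" (every module must be covered, and "disk" entries need offload_folder). A minimal standalone sketch with a stand-in model id, not taken from the commit:

    import torch
    from transformers import AutoModelForCausalLM

    # GPT-Neo-125M has 12 blocks; put the first six on GPU 0, the rest in CPU RAM.
    device_map = {f"transformer.h.{i}": 0 if i < 6 else "cpu" for i in range(12)}
    device_map.update({
        "transformer.wte": 0,
        "transformer.wpe": 0,
        "transformer.drop": 0,
        "transformer.ln_f": "cpu",
        "lm_head": 0,  # kept with transformer.wte, whose weights it ties
    })

    model = AutoModelForCausalLM.from_pretrained(
        "EleutherAI/gpt-neo-125m",
        device_map=device_map,
        offload_folder="accelerate-disk-cache",  # only used if something maps to "disk"
        torch_dtype=torch.float16,
    )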
@@ -389,18 +417,19 @@ class HFTorchInferenceModel(HFInferenceModel):
         yield False
 
     def breakmodel_device_list(self, n_layers, primary=None, selected=None):
+        return
         # TODO: Find a better place for this or rework this
 
         device_count = torch.cuda.device_count()
         if device_count < 2:
             primary = None
 
         logger.debug("n_layers: {}".format(n_layers))
-        logger.debug("gpu blocks: {}".format(breakmodel.gpu_blocks))
-        gpu_blocks = breakmodel.gpu_blocks + (
-            device_count - len(breakmodel.gpu_blocks)
+        logger.debug("gpu blocks: {}".format(self.breakmodel_config.gpu_blocks))
+
+        gpu_blocks = self.breakmodel_config.gpu_blocks + (
+            device_count - len(self.breakmodel_config.gpu_blocks)
         ) * [0]
 
         print(f"{Colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{Colors.END}")
 
         for i in range(device_count):
             name = torch.cuda.get_device_name(i)
             if len(name) > 47:
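Note: the padding expression gives every detected CUDA device a row even when the stored split is shorter than the device count. For example:

    gpu_blocks = [28, 4]
    device_count = 4
    padded = gpu_blocks + (device_count - len(gpu_blocks)) * [0]
    print(padded)  # [28, 4, 0, 0]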
@@ -410,75 +439,70 @@ class HFTorchInferenceModel(HFInferenceModel):
             print(
                 f"{row_color}{Colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{Colors.END}"
             )
 
         row_color = Colors.END
         sep_color = Colors.YELLOW
         print(
-            f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}"
+            f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {self.breakmodel_config.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}"
         )
         print(
             f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){Colors.END}"
         )
 
     def breakmodel_device_config(self, config):
         # TODO: Find a better place for this or rework this
-        return
-
-        global breakmodel, generator
-        import breakmodel
-
         n_layers = utils.num_layers(config)
 
-        logger.debug("gpu blocks before modification: {}".format(breakmodel.gpu_blocks))
+        logger.debug("gpu blocks before modification: {}".format(self.breakmodel_config.gpu_blocks))
 
         if utils.args.cpu:
-            breakmodel.gpu_blocks = [0] * n_layers
+            self.breakmodel_config.gpu_blocks = [0] * n_layers
             return
 
-        elif breakmodel.gpu_blocks == []:
+        elif self.breakmodel_config.gpu_blocks == []:
             logger.info("Breakmodel not specified, assuming GPU 0")
-            breakmodel.gpu_blocks = [n_layers]
+            self.breakmodel_config.gpu_blocks = [n_layers]
             n_layers = 0
 
         else:
             s = n_layers
-            for i in range(len(breakmodel.gpu_blocks)):
-                if breakmodel.gpu_blocks[i] <= -1:
-                    breakmodel.gpu_blocks[i] = s
+            for i in range(len(self.breakmodel_config.gpu_blocks)):
+                if self.breakmodel_config.gpu_blocks[i] <= -1:
+                    self.breakmodel_config.gpu_blocks[i] = s
                     break
                 else:
-                    s -= breakmodel.gpu_blocks[i]
-            assert sum(breakmodel.gpu_blocks) <= n_layers
-            n_layers -= sum(breakmodel.gpu_blocks)
-            if breakmodel.disk_blocks is not None:
-                assert breakmodel.disk_blocks <= n_layers
-                n_layers -= breakmodel.disk_blocks
+                    s -= self.breakmodel_config.gpu_blocks[i]
+            assert sum(self.breakmodel_config.gpu_blocks) <= n_layers
+            n_layers -= sum(self.breakmodel_config.gpu_blocks)
+            if self.breakmodel_config.disk_blocks is not None:
+                assert self.breakmodel_config.disk_blocks <= n_layers
+                n_layers -= self.breakmodel_config.disk_blocks
 
         logger.init_ok("Final device configuration:", status="Info")
-        self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)
+        self.breakmodel_device_list(n_layers, primary=self.breakmodel_config.primary_device)
         with open(
             "settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w"
         ) as file:
             file.write(
                 "{}\n{}".format(
-                    ",".join(map(str, breakmodel.gpu_blocks)), breakmodel.disk_blocks
+                    ",".join(map(str, self.breakmodel_config.gpu_blocks)), self.breakmodel_config.disk_blocks
                 )
             )
 
         # If all layers are on the same device, use the old GPU generation mode
-        while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0:
-            breakmodel.gpu_blocks.pop()
+        while len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[-1] == 0:
+            self.breakmodel_config.gpu_blocks.pop()
         self.breakmodel = True
-        if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (
+        if len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[-1] in (
             -1,
             utils.num_layers(config),
         ):
             logger.debug("All layers on same GPU. Breakmodel disabled")
             self.breakmodel = False
             self.usegpu = True
-            utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks) - 1
+            utils.koboldai_vars.gpu_device = len(self.breakmodel_config.gpu_blocks) - 1
             return
 
-        if not breakmodel.gpu_blocks:
+        if not self.breakmodel_config.gpu_blocks:
             logger.warning("Nothing assigned to a GPU, reverting to CPU only mode")
             self.breakmodel = False
             self.usegpu = False
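Note: a -1 entry in gpu_blocks is a sentinel for "all remaining layers", resolved by the loop in the else branch above. A standalone sketch of that resolution, plus the two-line settings/<model>.breakmodel format written afterwards (values are illustrative):

    def resolve_gpu_blocks(gpu_blocks: list, n_layers: int) -> list:
        # Replace the first -1 entry with every layer not yet claimed.
        gpu_blocks = list(gpu_blocks)
        remaining = n_layers
        for i, blocks in enumerate(gpu_blocks):
            if blocks <= -1:
                gpu_blocks[i] = remaining
                break
            remaining -= blocks
        return gpu_blocks

    print(resolve_gpu_blocks([4, -1], 32))  # [4, 28]

    # The settings file then holds the GPU split on line 1 and the disk
    # block count on line 2, e.g.:
    # 4,28
    # 0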