mirror of https://github.com/KoboldAI/KoboldAI-Client.git
Basic breakmodel ui support
Seems to work
@@ -59,8 +59,6 @@ class model_backend(HFTorchInferenceModel):
         # Also, lazy loader doesn't support GPT-2 models
         self.lazy_load = False
 
-        # If we're using torch_lazy_loader, we need to get breakmodel config
-        # early so that it knows where to load the individual model tensors
         logger.debug(
             "lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(
                 self.lazy_load,
@@ -70,6 +68,16 @@ class model_backend(HFTorchInferenceModel):
             )
         )
 
+        # If we're using torch_lazy_loader, we need to get breakmodel config
+        # early so that it knows where to load the individual model tensors
+        if (
+            self.lazy_load
+            and utils.koboldai_vars.hascuda
+            and utils.koboldai_vars.breakmodel
+            and not utils.koboldai_vars.nobreakmodel
+        ):
+            self.breakmodel_device_config(self.model_config)
+
         if self.lazy_load:
             # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time
             tf_kwargs.pop("low_cpu_mem_usage", None)
@@ -1,17 +1,13 @@
 from __future__ import annotations
+from dataclasses import dataclass
 
-import gc
 import os
 import time
 import bisect
-import zipfile
-import functools
 import itertools
 import traceback
 import contextlib
-from accelerate.big_modeling import load_checkpoint_and_dispatch
-from accelerate.utils.modeling import infer_auto_device_map, load_checkpoint_in_model
-from tqdm.auto import tqdm
+from torch import nn
 from typing import Dict, List, Optional, Union
 
 import torch
@@ -41,17 +37,36 @@ from modeling.inference_model import (
     use_core_manipulations,
 )
 
-try:
-    import accelerate.utils
-except ModuleNotFoundError as e:
-    if not utils.koboldai_vars.use_colab_tpu:
-        raise e
 
 # When set to true, messages will appear in the console if samplers are not
 # changing the scores. Keep in mind some samplers don't always change the
 # scores for each token.
 LOG_SAMPLER_NO_EFFECT = False
 
+
+class BreakmodelConfig:
+    def __init__(self) -> None:
+        self.disk_blocks = 0
+        self.gpu_blocks = []
+        self.primary_device = 0 if torch.cuda.device_count() > 0 else "cpu"
+
+    def get_device_map(self, model: nn.Module) -> dict:
+        ram_blocks = len(utils.layers_module_names) - sum(self.gpu_blocks)
+        cumulative_gpu_blocks = tuple(itertools.accumulate(self.gpu_blocks))
+        device_map = {}
+
+        for name in utils.layers_module_names:
+            layer = int(name.rsplit(".", 1)[1])
+            device = (
+                ("disk" if layer < self.disk_blocks else "cpu")
+                if layer < ram_blocks
+                else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
+            )
+            device_map[name] = device
+
+        for name in utils.get_missing_module_names(model, list(device_map.keys())):
+            device_map[name] = self.primary_device
+
+        return device_map
+
+
 class HFTorchInferenceModel(HFInferenceModel):
     def __init__(self) -> None:
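The interesting piece of this hunk is get_device_map: layers below ram_blocks go to disk or CPU, and the remaining layers are assigned to GPUs by bisecting the cumulative per-GPU block counts. A standalone sketch of that assignment rule, with hypothetical module names and block counts standing in for utils.layers_module_names and the UI-supplied values:

    import bisect
    import itertools

    # Hypothetical 6-layer model: 1 layer on disk, 2 on CPU, 2 on cuda:0, 1 on cuda:1.
    layer_names = [f"transformer.h.{i}" for i in range(6)]
    disk_blocks = 1
    gpu_blocks = [2, 1]

    ram_blocks = len(layer_names) - sum(gpu_blocks)       # 3 layers stay off-GPU
    cumulative = tuple(itertools.accumulate(gpu_blocks))  # (2, 3)

    device_map = {}
    for name in layer_names:
        layer = int(name.rsplit(".", 1)[1])
        device_map[name] = (
            ("disk" if layer < disk_blocks else "cpu")
            if layer < ram_blocks
            else bisect.bisect_right(cumulative, layer - ram_blocks)
        )

    print(device_map)
    # {'transformer.h.0': 'disk', 'transformer.h.1': 'cpu', 'transformer.h.2': 'cpu',
    #  'transformer.h.3': 0, 'transformer.h.4': 0, 'transformer.h.5': 1}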
@@ -80,6 +95,16 @@ class HFTorchInferenceModel(HFInferenceModel):
             post_token_probs=True,
         )
         self._old_stopping_criteria = None
+        self.breakmodel_config = BreakmodelConfig()
+
+    def set_input_parameters(self, parameters):
+        ret = super().set_input_parameters(parameters)
+
+        # Hook onto input param setting for setting breakmodel stuff
+        self.breakmodel_config.gpu_blocks = self.layers
+        self.breakmodel_config.disk_blocks = self.disk_layers
+
+        return ret
 
     def _apply_warpers(
         self, scores: torch.Tensor, input_ids: torch.Tensor
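This override is the "ui support" of the commit title: whenever the frontend pushes new input parameters, the per-GPU and disk layer counts are mirrored into the BreakmodelConfig. A minimal mock of the hook pattern (the classes and parameter keys here are illustrative, not KoboldAI's real ones; the actual code assumes the base class populates self.layers and self.disk_layers):

    class Base:
        def set_input_parameters(self, parameters):
            # stand-in for the real parsing of the UI payload
            self.layers = parameters.get("layers", [])
            self.disk_layers = parameters.get("disk_layers", 0)

    class Config:
        def __init__(self):
            self.gpu_blocks = []
            self.disk_blocks = 0

    class Backend(Base):
        def __init__(self):
            self.breakmodel_config = Config()

        def set_input_parameters(self, parameters):
            ret = super().set_input_parameters(parameters)
            # mirror the UI's layer split into the breakmodel config
            self.breakmodel_config.gpu_blocks = self.layers
            self.breakmodel_config.disk_blocks = self.disk_layers
            return ret

    backend = Backend()
    backend.set_input_parameters({"layers": [8, -1], "disk_layers": 0})
    print(backend.breakmodel_config.gpu_blocks)  # [8, -1]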
@@ -278,17 +303,20 @@ class HFTorchInferenceModel(HFInferenceModel):
 
         # Try to determine model type from either AutoModel or falling back to legacy
         try:
-            print("[HUGE SKELETON] LOADING FROM PRETRAINED")
+            with lazy_loader.use_lazy_load(dematerialized_modules=True):
+                metamodel = AutoModelForCausalLM.from_config(self.model_config)
+                device_map = self.breakmodel_config.get_device_map(metamodel)
+
             with lazy_loader.use_lazy_load(
                 enable=True,
                 # DO NOT DEMATERIALIZE MODULES / INIT WEIGHTS EMPTY!!! IT WILL EXPLODE!!!!!!!
-                # dematerialized_modules=True,
                 dematerialized_modules=False,
             ):
+                print(device_map)
                 model = AutoModelForCausalLM.from_pretrained(
                     location,
-                    device_map="auto",
-                    # max_memory={0: "10GiB", 1: "7GiB", "cpu": "20GiB"},
+                    # device_map="auto",
+                    device_map=device_map,
                     offload_folder="accelerate-disk-cache",
                     torch_dtype=torch.float16,
                     **tf_kwargs,
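The key change here: instead of letting accelerate pick a placement with device_map="auto", a dematerialized "meta" copy of the model is instantiated first, the breakmodel device map is computed from it, and that explicit map is handed to from_pretrained. Passing a hand-built map this way is standard transformers/accelerate usage; a minimal standalone sketch (model id and placement are placeholders for illustration, and any "disk" entries additionally require the offload folder):

    import torch
    from transformers import AutoModelForCausalLM

    # Hypothetical placement for a GPT-2-style model: embeddings and head on
    # cuda:0, every transformer block offloaded to CPU RAM.
    device_map = {
        "transformer.wte": 0,
        "transformer.wpe": 0,
        "transformer.drop": 0,
        "transformer.h": "cpu",
        "transformer.ln_f": 0,
        "lm_head": 0,
    }

    model = AutoModelForCausalLM.from_pretrained(
        "gpt2",  # placeholder model id
        device_map=device_map,
        offload_folder="accelerate-disk-cache",
        torch_dtype=torch.float16,
    )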
@@ -389,18 +417,19 @@ class HFTorchInferenceModel(HFInferenceModel):
             yield False
 
     def breakmodel_device_list(self, n_layers, primary=None, selected=None):
-        return
-        # TODO: Find a better place for this or rework this
-
         device_count = torch.cuda.device_count()
         if device_count < 2:
             primary = None
 
         logger.debug("n_layers: {}".format(n_layers))
-        logger.debug("gpu blocks: {}".format(breakmodel.gpu_blocks))
-        gpu_blocks = breakmodel.gpu_blocks + (
-            device_count - len(breakmodel.gpu_blocks)
+        logger.debug("gpu blocks: {}".format(self.breakmodel_config.gpu_blocks))
+        gpu_blocks = self.breakmodel_config.gpu_blocks + (
+            device_count - len(self.breakmodel_config.gpu_blocks)
         ) * [0]
 
         print(f"{Colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{Colors.END}")
 
         for i in range(device_count):
             name = torch.cuda.get_device_name(i)
             if len(name) > 47:
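Two details worth noting in this hunk: the dead "return" that previously disabled the device listing is removed, and the per-GPU list is right-padded with zeros so every detected CUDA device gets a row in the printed table. The padding is plain list arithmetic:

    gpu_blocks = [8, 20]
    device_count = 4
    gpu_blocks = gpu_blocks + (device_count - len(gpu_blocks)) * [0]
    print(gpu_blocks)  # [8, 20, 0, 0]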
@@ -410,75 +439,70 @@ class HFTorchInferenceModel(HFInferenceModel):
             print(
                 f"{row_color}{Colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{Colors.END}"
             )
 
         row_color = Colors.END
         sep_color = Colors.YELLOW
         print(
-            f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}"
+            f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {self.breakmodel_config.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}"
         )
         print(
             f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){Colors.END}"
         )
 
     def breakmodel_device_config(self, config):
-        # TODO: Find a better place for this or rework this
-        return
-
-        global breakmodel, generator
-        import breakmodel
-
         n_layers = utils.num_layers(config)
 
-        logger.debug("gpu blocks before modification: {}".format(breakmodel.gpu_blocks))
+        logger.debug("gpu blocks before modification: {}".format(self.breakmodel_config.gpu_blocks))
 
         if utils.args.cpu:
-            breakmodel.gpu_blocks = [0] * n_layers
+            self.breakmodel_config.gpu_blocks = [0] * n_layers
             return
 
-        elif breakmodel.gpu_blocks == []:
+        elif self.breakmodel_config.gpu_blocks == []:
             logger.info("Breakmodel not specified, assuming GPU 0")
-            breakmodel.gpu_blocks = [n_layers]
+            self.breakmodel_config.gpu_blocks = [n_layers]
             n_layers = 0
 
         else:
             s = n_layers
-            for i in range(len(breakmodel.gpu_blocks)):
-                if breakmodel.gpu_blocks[i] <= -1:
-                    breakmodel.gpu_blocks[i] = s
+            for i in range(len(self.breakmodel_config.gpu_blocks)):
+                if self.breakmodel_config.gpu_blocks[i] <= -1:
+                    self.breakmodel_config.gpu_blocks[i] = s
                     break
                 else:
-                    s -= breakmodel.gpu_blocks[i]
-            assert sum(breakmodel.gpu_blocks) <= n_layers
-            n_layers -= sum(breakmodel.gpu_blocks)
-            if breakmodel.disk_blocks is not None:
-                assert breakmodel.disk_blocks <= n_layers
-                n_layers -= breakmodel.disk_blocks
+                    s -= self.breakmodel_config.gpu_blocks[i]
+            assert sum(self.breakmodel_config.gpu_blocks) <= n_layers
+            n_layers -= sum(self.breakmodel_config.gpu_blocks)
+            if self.breakmodel_config.disk_blocks is not None:
+                assert self.breakmodel_config.disk_blocks <= n_layers
+                n_layers -= self.breakmodel_config.disk_blocks
 
         logger.init_ok("Final device configuration:", status="Info")
-        self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)
+        self.breakmodel_device_list(n_layers, primary=self.breakmodel_config.primary_device)
         with open(
             "settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w"
         ) as file:
             file.write(
                 "{}\n{}".format(
-                    ",".join(map(str, breakmodel.gpu_blocks)), breakmodel.disk_blocks
+                    ",".join(map(str, self.breakmodel_config.gpu_blocks)), self.breakmodel_config.disk_blocks
                 )
             )
 
         # If all layers are on the same device, use the old GPU generation mode
-        while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0:
-            breakmodel.gpu_blocks.pop()
+        while len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[-1] == 0:
+            self.breakmodel_config.gpu_blocks.pop()
         self.breakmodel = True
-        if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (
+        if len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[-1] in (
             -1,
             utils.num_layers(config),
         ):
             logger.debug("All layers on same GPU. Breakmodel disabled")
             self.breakmodel = False
             self.usegpu = True
-            utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks) - 1
+            utils.koboldai_vars.gpu_device = len(self.breakmodel_config.gpu_blocks) - 1
             return
 
-        if not breakmodel.gpu_blocks:
+        if not self.breakmodel_config.gpu_blocks:
             logger.warning("Nothing assigned to a GPU, reverting to CPU only mode")
             self.breakmodel = False
             self.usegpu = False
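The normalization in the else-branch gives a -1 entry in gpu_blocks the meaning "all layers not claimed by earlier GPUs". A worked example of that rule in isolation (hypothetical layer counts):

    def normalize_gpu_blocks(gpu_blocks, n_layers):
        # Replace the first -1 entry with however many layers remain.
        s = n_layers
        for i in range(len(gpu_blocks)):
            if gpu_blocks[i] <= -1:
                gpu_blocks[i] = s
                break
            s -= gpu_blocks[i]
        return gpu_blocks

    print(normalize_gpu_blocks([8, -1], 28))  # [8, 20] -> GPU 1 takes the rest
    print(normalize_gpu_blocks([4, 4], 28))   # [4, 4]  -> 20 layers left for CPU/disk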