KoboldAI-Client/modeling/inference_models/hf_torch.py
from __future__ import annotations
import gc
import os
import time
import bisect
import zipfile
import functools
import itertools
import traceback
import contextlib
from tqdm.auto import tqdm
from typing import Dict, List, Union
import torch
from torch.nn import Embedding
import transformers
from transformers import (
StoppingCriteria,
GPTNeoForCausalLM,
AutoModelForCausalLM,
LogitsProcessorList,
LogitsProcessor,
)
import utils
import torch_lazy_loader
from logger import logger, Colors
from modeling import warpers
from modeling import inference_model
from modeling.warpers import Warper
from modeling.stoppers import Stoppers
from modeling.post_token_hooks import PostTokenHooks
from modeling.inference_models.hf import HFInferenceModel
from modeling.inference_model import (
GenerationResult,
GenerationSettings,
InferenceModel,
ModelCapabilities,
use_core_manipulations,
)
try:
import breakmodel
import accelerate.utils
except ModuleNotFoundError as e:
if not utils.koboldai_vars.use_colab_tpu:
raise e
class HFTorchInferenceModel(HFInferenceModel):
def __init__(
self,
model_name: str,
lazy_load: bool,
low_mem: bool,
) -> None:
super().__init__()
self.model_name = model_name
self.lazy_load = lazy_load
self.low_mem = low_mem
        self.post_token_hooks = [
            PostTokenHooks.stream_tokens,
        ]

        self.stopper_hooks = [
            Stoppers.core_stopper,
            Stoppers.dynamic_wi_scanner,
            Stoppers.chat_mode_stopper,
        ]
self.model = None
self.tokenizer = None
self.capabilties = ModelCapabilities(
embedding_manipulation=True,
post_token_hooks=True,
stopper_hooks=True,
post_token_probs=True,
)
self._old_stopping_criteria = None
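
    # Applies KoboldAI's sampler warpers (temperature, top-k/top-p, repetition
    # penalty, etc.) to the raw logits, in the user's configured sampler order.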
def _apply_warpers(
self, scores: torch.Tensor, input_ids: torch.Tensor
) -> torch.Tensor:
warpers.update_settings()
for sid in utils.koboldai_vars.sampler_order:
warper = Warper.from_id(sid)
if warper == warpers.RepetitionPenalty:
# Rep pen needs more data than other samplers
scores = warper.torch(scores, input_ids=input_ids)
else:
scores = warper.torch(scores)
return scores
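
    # Runs after the model is loaded. Monkey-patches transformers'
    # GenerationMixin through use_core_manipulations: a stopping-criteria hook
    # for KoboldAI's stoppers and token streaming, replacement logits
    # processors (phrase biasing and the Lua bridge, currently left unattached
    # per the TODO below), a warper list that applies KoboldAI's samplers, and
    # a sample() wrapper that installs that warper list.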
def _post_load(self) -> None:
# Patch stopping_criteria
class PTHStopper(StoppingCriteria):
def __call__(
hf_self,
input_ids: torch.LongTensor,
scores: torch.FloatTensor,
            ) -> bool:
self._post_token_gen(input_ids)
for stopper in self.stopper_hooks:
do_stop = stopper(input_ids)
if do_stop:
return True
return False
old_gsc = transformers.GenerationMixin._get_stopping_criteria
def _get_stopping_criteria(
hf_self,
*args,
**kwargs,
):
stopping_criteria = old_gsc(hf_self, *args, **kwargs)
stopping_criteria.insert(0, PTHStopper())
return stopping_criteria
use_core_manipulations.get_stopping_criteria = _get_stopping_criteria
        # Patch logits warpers
class PhraseBiasLogitsProcessor(LogitsProcessor):
def __init__(self):
pass
def _find_intersection(self, big: List, small: List) -> int:
"""Find the maximum overlap between the beginning of small and the end of big.
Return the index of the token in small following the overlap, or 0.
big: The tokens in the context (as a tensor)
small: The tokens in the phrase to bias (as a list)
Both big and small are in "oldest to newest" order.
"""
# There are asymptotically more efficient methods for determining the overlap,
# but typically there will be few (0-1) instances of small[0] in the last len(small)
# elements of big, plus small will typically be fairly short. So this naive
# approach is acceptable despite O(N^2) worst case performance.
num_small = len(small)
# The small list can only ever match against at most num_small tokens of big,
# so create a slice. Typically, this slice will be as long as small, but it
# may be shorter if the story has just started.
# We need to convert the big slice to list, since natively big is a tensor
# and tensor and list don't ever compare equal. It's better to convert here
# and then use native equality tests than to iterate repeatedly later.
big_slice = list(big[-num_small:])
# It's possible that the start token appears multiple times in small
# For example, consider the phrase:
# [ fair is foul, and foul is fair, hover through the fog and filthy air]
# If we merely look for the first instance of [ fair], then we would
# generate the following output:
# " fair is foul, and foul is fair is foul, and foul is fair..."
start = small[0]
for i, t in enumerate(big_slice):
# Strictly unnecessary, but it's marginally faster to test the first
# token before creating slices to test for a full match.
if t == start:
remaining = len(big_slice) - i
if big_slice[i:] == small[:remaining]:
# We found a match. If the small phrase has any remaining tokens
# then return the index of the next token.
if remaining < num_small:
return remaining
# In this case, the entire small phrase matched, so start over.
return 0
# There were no matches, so just begin at the beginning.
return 0
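
            # Worked illustration of the overlap search above (arbitrary token
            # ids, not from any real vocabulary): if big ends in
            # [..., 10, 20, 30] and small is [20, 30, 40], the last two tokens
            # of big match small[:2], so 2 is returned and small[2] (40) is the
            # next token to bias. If big already ends with the whole of small,
            # 0 is returned so biasing starts over from small[0].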
def _allow_leftwards_tampering(self, phrase: str) -> bool:
"""Determines if a phrase should be tampered with from the left in
the "soft" token encoding mode."""
if phrase[0] in [".", "?", "!", ";", ":", "\n"]:
return False
return True
def _get_token_sequence(self, phrase: str) -> List[List]:
"""Convert the phrase string into a list of encoded biases, each
one being a list of tokens. How this is done is determined by the
phrase's format:
- If the phrase is surrounded by square brackets ([]), the tokens
will be the phrase split by commas (,). If a "token" isn't
actually a number, it will be skipped. NOTE: Tokens output by
this may not be in the model's vocabulary, and such tokens
should be ignored later in the pipeline.
- If the phrase is surrounded by curly brackets ({}), the phrase
will be directly encoded with no synonym biases and no fancy
tricks.
- Otherwise, the phrase will be encoded, with close deviations
being included as synonym biases.
"""
# TODO: Cache these tokens, invalidate when model or bias is
# changed.
# Handle direct token id input
if phrase.startswith("[") and phrase.endswith("]"):
no_brackets = phrase[1:-1]
ret = []
for token_id in no_brackets.split(","):
try:
ret.append(int(token_id))
except ValueError:
# Ignore non-numbers. Rascals!
pass
return [ret]
# Handle direct phrases
if phrase.startswith("{") and phrase.endswith("}"):
no_brackets = phrase[1:-1]
return [inference_model.current_model.tokenizer.encode(no_brackets)]
# Handle untamperable phrases
if not self._allow_leftwards_tampering(phrase):
return [inference_model.current_model.tokenizer.encode(phrase)]
# Handle slight alterations to original phrase
phrase = phrase.strip(" ")
ret = []
for alt_phrase in [phrase, f" {phrase}"]:
ret.append(
inference_model.current_model.tokenizer.encode(alt_phrase)
)
return ret
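
            # Examples of the three phrase formats handled by
            # _get_token_sequence above:
            #   "[11, 22, 33]"  -> one bias sequence made of those literal
            #                      token ids (non-numeric entries are skipped).
            #   "{Hello world}" -> the text inside the braces is encoded
            #                      exactly once, with no variants.
            #   "Hello world"   -> encoded both as-is and with a leading space,
            #                      so the bias also applies mid-sentence
            #                      (phrases starting with sentence-ending
            #                      punctuation skip the leading-space variant).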
def _get_biased_tokens(self, input_ids: List) -> Dict:
# TODO: Different "bias slopes"?
ret = {}
for phrase, _bias in utils.koboldai_vars.biases.items():
bias_score, completion_threshold = _bias
token_seqs = self._get_token_sequence(phrase)
variant_deltas = {}
for token_seq in token_seqs:
bias_index = self._find_intersection(input_ids, token_seq)
# Ensure completion after completion_threshold tokens
# Only provide a positive bias when the base bias score is positive.
if bias_score > 0 and bias_index + 1 > completion_threshold:
bias_score = 999
token_to_bias = token_seq[bias_index]
variant_deltas[token_to_bias] = bias_score
# If multiple phrases bias the same token, add the modifiers
# together. This should NOT be applied to automatic variants
for token_to_bias, bias_score in variant_deltas.items():
if token_to_bias in ret:
ret[token_to_bias] += bias_score
else:
ret[token_to_bias] = bias_score
return ret
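
            # The returned dict maps token id -> logit delta; __call__ below
            # simply adds these deltas onto the scores of each batch row.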
def __call__(
self, input_ids: torch.LongTensor, scores: torch.FloatTensor
) -> torch.FloatTensor:
assert scores.ndim == 2
assert input_ids.ndim == 2
scores_shape = scores.shape
for batch in range(scores_shape[0]):
for token, bias in self._get_biased_tokens(
input_ids[batch]
).items():
scores[batch][token] += bias
return scores
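
        # Exposes the raw logits to the Lua userscript bridge so generation
        # modifier scripts can inspect and rewrite them before sampling.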
class LuaLogitsProcessor(LogitsProcessor):
def __init__(self):
pass
def __call__(
self, input_ids: torch.LongTensor, scores: torch.FloatTensor
) -> torch.FloatTensor:
assert scores.ndim == 2
assert input_ids.ndim == 2
self.regeneration_required = False
self.halt = False
if utils.koboldai_vars.standalone:
return scores
scores_shape = scores.shape
scores_list = scores.tolist()
utils.koboldai_vars.lua_koboldbridge.logits = (
utils.koboldai_vars.lua_state.table()
)
for r, row in enumerate(scores_list):
utils.koboldai_vars.lua_koboldbridge.logits[
r + 1
] = utils.koboldai_vars.lua_state.table(*row)
utils.koboldai_vars.lua_koboldbridge.vocab_size = scores_shape[-1]
utils.koboldai_vars.lua_koboldbridge.execute_genmod()
                # Rebuild the scores tensor from the (possibly modified) Lua table.
                scores = torch.tensor(
tuple(
tuple(row.values())
for row in utils.koboldai_vars.lua_koboldbridge.logits.values()
),
device=scores.device,
dtype=scores.dtype,
)
assert scores.shape == scores_shape
return scores
from torch.nn import functional as F
def visualize_probabilities(
model: InferenceModel,
scores: torch.FloatTensor,
) -> None:
assert scores.ndim == 2
if utils.koboldai_vars.numseqs > 1 or not utils.koboldai_vars.show_probs:
return
if not utils.koboldai_vars.show_probs:
return scores
option_offset = 0
if (
utils.koboldai_vars.actions.action_count + 1
in utils.koboldai_vars.actions.actions
):
for x in range(
len(
utils.koboldai_vars.actions.actions[
utils.koboldai_vars.actions.action_count + 1
]["Options"]
)
):
option = utils.koboldai_vars.actions.actions[
utils.koboldai_vars.actions.action_count + 1
]["Options"][x]
if (
option["Pinned"]
or option["Previous Selection"]
or option["Edited"]
):
option_offset = x + 1
batch_offset = (
int(
(utils.koboldai_vars.generated_tkns - 1)
/ utils.koboldai_vars.genamt
)
if utils.koboldai_vars.alt_multi_gen
else 0
)
for batch_index, batch in enumerate(scores):
probs = F.softmax(batch, dim=-1).cpu().numpy()
token_prob_info = []
for token_id, score in sorted(
enumerate(probs), key=lambda x: x[1], reverse=True
)[:8]:
token_prob_info.append(
{
"tokenId": token_id,
"decoded": utils.decodenewlines(
model.tokenizer.decode(token_id)
),
"score": float(score),
}
)
if utils.koboldai_vars.numseqs == 1:
utils.koboldai_vars.actions.set_probabilities(token_prob_info)
else:
utils.koboldai_vars.actions.set_option_probabilities(
token_prob_info, batch_index + option_offset + batch_offset
)
return scores
def new_get_logits_processor(*args, **kwargs) -> LogitsProcessorList:
processors = new_get_logits_processor.old_get_logits_processor(
*args, **kwargs
)
# TODOB4MERGE: These two
# processors.insert(0, LuaLogitsProcessor())
# processors.append(PhraseBiasLogitsProcessor())
return processors
use_core_manipulations.get_logits_processor = new_get_logits_processor
new_get_logits_processor.old_get_logits_processor = (
transformers.GenerationMixin._get_logits_processor
)
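
        # Replacement for transformers' logits warper list: instead of the
        # stock warpers it defers to KoboldAI's own samplers via _apply_warpers
        # and pushes per-token probabilities to the UI.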
class KoboldLogitsWarperList(LogitsProcessorList):
def __init__(self):
pass
def __call__(
lw_self,
input_ids: torch.LongTensor,
scores: torch.FloatTensor,
*args,
**kwargs,
):
# sampler_order = utils.koboldai_vars.sampler_order[:]
# if (
# len(sampler_order) < 7
# ): # Add repetition penalty at beginning if it's not present
# sampler_order = [6] + sampler_order
# for k in sampler_order:
# scores = self.__warper_list[k](input_ids, scores, *args, **kwargs)
scores = self._apply_warpers(scores=scores, input_ids=input_ids)
visualize_probabilities(inference_model.current_model, scores)
return scores
def new_get_logits_warper(
beams: int = 1,
) -> LogitsProcessorList:
return KoboldLogitsWarperList()
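
        # Wrapper around GenerationMixin.sample that swaps in the Kobold
        # warper list, disables the EOS token when the newline mode requires
        # it, and provides a default pad token id.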
def new_sample(self, *args, **kwargs):
assert kwargs.pop("logits_warper", None) is not None
kwargs["logits_warper"] = new_get_logits_warper(
beams=1,
)
if utils.koboldai_vars.newlinemode in ["s", "ns"]:
kwargs["eos_token_id"] = -1
kwargs.setdefault("pad_token_id", 2)
return new_sample.old_sample(self, *args, **kwargs)
new_sample.old_sample = transformers.GenerationMixin.sample
use_core_manipulations.sample = new_sample
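
    # Core single-call generation: moves the prompt to the working device and
    # runs model.generate() with sampling enabled, banning the configured
    # bad-word sequences (plus newline when single_line is set); the hooks
    # installed in _post_load handle stopping, streaming and probability
    # display.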
def _raw_generate(
self,
prompt_tokens: Union[List[int], torch.Tensor],
max_new: int,
gen_settings: GenerationSettings,
single_line: bool = False,
batch_count: int = 1,
**kwargs
) -> GenerationResult:
if not isinstance(prompt_tokens, torch.Tensor):
gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
else:
gen_in = prompt_tokens
device = utils.get_auxilary_device()
gen_in = gen_in.to(device)
additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else []
with torch.no_grad():
start_time = time.time()
genout = self.model.generate(
gen_in,
do_sample=True,
max_length=min(
len(prompt_tokens) + max_new, utils.koboldai_vars.max_length
),
repetition_penalty=1.0,
bad_words_ids=utils.koboldai_vars.badwordsids
+ additional_bad_words_ids,
use_cache=True,
num_return_sequences=batch_count,
)
logger.debug(
"torch_raw_generate: run generator {}s".format(time.time() - start_time)
)
return GenerationResult(
self,
out_batches=genout,
prompt=prompt_tokens,
is_whole_generation=False,
output_includes_prompt=True,
)
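
    # Loads the checkpoint with AutoModelForCausalLM. A CUDA out-of-memory
    # failure is reported to the user; any other failure triggers a retry with
    # GPTNeoForCausalLM as a legacy fallback.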
def _get_model(self, location: str, tf_kwargs: Dict):
try:
return AutoModelForCausalLM.from_pretrained(
location,
revision=utils.koboldai_vars.revision,
cache_dir="cache",
**tf_kwargs,
)
except Exception as e:
if "out of memory" in traceback.format_exc().lower():
raise RuntimeError(
"One of your GPUs ran out of memory when KoboldAI tried to load your model."
)
return GPTNeoForCausalLM.from_pretrained(
location,
revision=utils.koboldai_vars.revision,
cache_dir="cache",
**tf_kwargs,
)
def get_hidden_size(self) -> int:
return self.model.get_input_embeddings().embedding_dim
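
    # Distributes the loaded model according to the breakmodel configuration:
    # whole-model GPU or CPU placement when breakmodel is off, otherwise a
    # per-layer device_map (disk / CPU / GPU index) dispatched through
    # accelerate. For example, with gpu_blocks = [2, 3] the first two
    # transformer layers after the CPU/disk blocks go to GPU 0 and the next
    # three to GPU 1 (illustrative numbers, not a real configuration).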
def _move_to_devices(self) -> None:
if not utils.koboldai_vars.breakmodel:
if utils.koboldai_vars.usegpu:
self.model = self.model.half().to(utils.koboldai_vars.gpu_device)
else:
self.model = self.model.to("cpu").float()
return
for key, value in self.model.state_dict().items():
target_dtype = (
torch.float32 if breakmodel.primary_device == "cpu" else torch.float16
)
if value.dtype is not target_dtype:
accelerate.utils.set_module_tensor_to_device(
self.model, key, target_dtype
)
disk_blocks = breakmodel.disk_blocks
gpu_blocks = breakmodel.gpu_blocks
ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks)
cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
device_map = {}
for name in utils.layers_module_names:
layer = int(name.rsplit(".", 1)[1])
device = (
("disk" if layer < disk_blocks else "cpu")
if layer < ram_blocks
else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
)
device_map[name] = device
for name in utils.get_missing_module_names(self.model, list(device_map.keys())):
device_map[name] = breakmodel.primary_device
breakmodel.dispatch_model_ex(
self.model,
device_map,
main_device=breakmodel.primary_device,
offload_buffers=True,
offload_dir="accelerate-disk-cache",
)
gc.collect()
return
# Function to patch transformers to use our soft prompt
def patch_embedding(self) -> None:
if getattr(Embedding, "_koboldai_patch_causallm_model", None):
Embedding._koboldai_patch_causallm_model = self.model
return
old_embedding_call = Embedding.__call__
kai_model = self
def new_embedding_call(self, input_ids, *args, **kwargs):
# Don't touch embeddings for models other than the core inference model (that's us!)
if (
Embedding._koboldai_patch_causallm_model.get_input_embeddings()
is not self
):
return old_embedding_call(self, input_ids, *args, **kwargs)
assert input_ids is not None
if utils.koboldai_vars.sp is not None:
shifted_input_ids = input_ids - kai_model.model.config.vocab_size
input_ids.clamp_(max=kai_model.model.config.vocab_size - 1)
inputs_embeds = old_embedding_call(self, input_ids, *args, **kwargs)
if utils.koboldai_vars.sp is not None:
utils.koboldai_vars.sp = utils.koboldai_vars.sp.to(
inputs_embeds.dtype
).to(inputs_embeds.device)
inputs_embeds = torch.where(
(shifted_input_ids >= 0)[..., None],
utils.koboldai_vars.sp[shifted_input_ids.clamp(min=0)],
inputs_embeds,
)
return inputs_embeds
Embedding.__call__ = new_embedding_call
Embedding._koboldai_patch_causallm_model = self.model
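
    # Builds the callback used by torch_lazy_loader: as each checkpoint tensor
    # is materialized it is converted to the right dtype and sent straight to
    # its assigned device (GPU, pinned CPU "shared" memory, or the accelerate
    # disk cache), so the full model never has to sit in RAM at once.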
def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True):
if not self.lazy_load:
return
if utils.args.breakmodel_disklayers is not None:
breakmodel.disk_blocks = utils.args.breakmodel_disklayers
disk_blocks = breakmodel.disk_blocks
gpu_blocks = breakmodel.gpu_blocks
        ram_blocks = n_layers - sum(gpu_blocks)
cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
def lazy_load_callback(
model_dict: Dict[str, Union[torch_lazy_loader.LazyTensor, torch.Tensor]],
f,
**_,
):
if lazy_load_callback.nested:
return
lazy_load_callback.nested = True
device_map: Dict[str, Union[str, int]] = {}
@functools.lru_cache(maxsize=None)
def get_original_key(key):
return max(
(
original_key
for original_key in utils.module_names
if original_key.endswith(key)
),
key=len,
)
for key, value in model_dict.items():
original_key = get_original_key(key)
if isinstance(value, torch_lazy_loader.LazyTensor) and not any(
original_key.startswith(n) for n in utils.layers_module_names
):
device_map[key] = (
utils.koboldai_vars.gpu_device
if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu
else "cpu"
if not utils.koboldai_vars.hascuda
or not utils.koboldai_vars.breakmodel
else breakmodel.primary_device
)
else:
layer = int(
max(
(
n
for n in utils.layers_module_names
if original_key.startswith(n)
),
key=len,
).rsplit(".", 1)[1]
)
device = (
utils.koboldai_vars.gpu_device
if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu
else "disk"
if layer < disk_blocks and layer < ram_blocks
else "cpu"
if not utils.koboldai_vars.hascuda
or not utils.koboldai_vars.breakmodel
else "shared"
if layer < ram_blocks
else bisect.bisect_right(
cumulative_gpu_blocks, layer - ram_blocks
)
)
device_map[key] = device
if utils.num_shards is None or utils.current_shard == 0:
utils.offload_index = {}
if os.path.isdir("accelerate-disk-cache"):
# Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder
# (the folder doesn't contain any subfolders so os.remove will do just fine)
for filename in os.listdir("accelerate-disk-cache"):
try:
os.remove(os.path.join("accelerate-disk-cache", filename))
except OSError:
pass
os.makedirs("accelerate-disk-cache", exist_ok=True)
if utils.num_shards is not None:
num_tensors = len(
utils.get_sharded_checkpoint_num_tensors(
utils.from_pretrained_model_name,
utils.from_pretrained_index_filename,
**utils.from_pretrained_kwargs,
)
)
else:
num_tensors = len(device_map)
print(flush=True)
utils.koboldai_vars.status_message = "Loading model"
utils.koboldai_vars.total_layers = num_tensors
utils.koboldai_vars.loaded_layers = 0
utils.bar = tqdm(
total=num_tensors,
desc="Loading model tensors",
file=utils.UIProgressBarFile(),
)
with zipfile.ZipFile(f, "r") as z:
try:
last_storage_key = None
zipfolder = os.path.basename(os.path.normpath(f)).split(".")[0]
f = None
current_offset = 0
able_to_pin_layers = True
if utils.num_shards is not None:
utils.current_shard += 1
for key in sorted(
device_map.keys(),
key=lambda k: (model_dict[k].key, model_dict[k].seek_offset),
):
storage_key = model_dict[key].key
if (
storage_key != last_storage_key
or model_dict[key].seek_offset < current_offset
):
last_storage_key = storage_key
if isinstance(f, zipfile.ZipExtFile):
f.close()
try:
f = z.open(f"archive/data/{storage_key}")
except:
f = z.open(f"{zipfolder}/data/{storage_key}")
current_offset = 0
if current_offset != model_dict[key].seek_offset:
f.read(model_dict[key].seek_offset - current_offset)
current_offset = model_dict[key].seek_offset
device = device_map[key]
size = functools.reduce(
lambda x, y: x * y, model_dict[key].shape, 1
)
dtype = model_dict[key].dtype
nbytes = (
size
if dtype is torch.bool
else size
* (
(
torch.finfo
if dtype.is_floating_point
else torch.iinfo
)(dtype).bits
>> 3
)
)
# print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
model_dict[key] = model_dict[key].materialize(
f, map_location="cpu"
)
if model_dict[key].dtype is torch.float32:
utils.koboldai_vars.fp32_model = True
if (
convert_to_float16
and breakmodel.primary_device != "cpu"
and utils.koboldai_vars.hascuda
and (
utils.koboldai_vars.breakmodel
or utils.koboldai_vars.usegpu
)
and model_dict[key].dtype is torch.float32
):
model_dict[key] = model_dict[key].to(torch.float16)
if breakmodel.primary_device == "cpu" or (
not utils.koboldai_vars.usegpu
and not utils.koboldai_vars.breakmodel
and model_dict[key].dtype is torch.float16
):
model_dict[key] = model_dict[key].to(torch.float32)
if device == "shared":
model_dict[key] = model_dict[key].to("cpu").detach_()
if able_to_pin_layers:
try:
model_dict[key] = model_dict[key].pin_memory()
except:
able_to_pin_layers = False
elif device == "disk":
accelerate.utils.offload_weight(
model_dict[key],
get_original_key(key),
"accelerate-disk-cache",
index=utils.offload_index,
)
model_dict[key] = model_dict[key].to("meta")
else:
model_dict[key] = model_dict[key].to(device)
# print("OK", flush=True)
current_offset += nbytes
utils.bar.update(1)
utils.koboldai_vars.loaded_layers += 1
finally:
if (
utils.num_shards is None
or utils.current_shard >= utils.num_shards
):
if utils.offload_index:
for name, tensor in utils.named_buffers:
dtype = tensor.dtype
if (
convert_to_float16
and breakmodel.primary_device != "cpu"
and utils.koboldai_vars.hascuda
and (
utils.koboldai_vars.breakmodel
or utils.koboldai_vars.usegpu
)
):
dtype = torch.float16
if breakmodel.primary_device == "cpu" or (
not utils.koboldai_vars.usegpu
and not utils.koboldai_vars.breakmodel
):
dtype = torch.float32
if (
name in model_dict
and model_dict[name].dtype is not dtype
):
model_dict[name] = model_dict[name].to(dtype)
if tensor.dtype is not dtype:
tensor = tensor.to(dtype)
if name not in utils.offload_index:
accelerate.utils.offload_weight(
tensor,
name,
"accelerate-disk-cache",
index=utils.offload_index,
)
accelerate.utils.save_offload_index(
utils.offload_index, "accelerate-disk-cache"
)
utils.bar.close()
utils.bar = None
utils.koboldai_vars.status_message = ""
lazy_load_callback.nested = False
if isinstance(f, zipfile.ZipExtFile):
f.close()
lazy_load_callback.nested = False
return lazy_load_callback
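
    # Temporarily switches torch's default dtype to float16 while loading, when
    # low-memory mode is active (or always_use is set), so weights are created
    # in half precision instead of being allocated in float32 first.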
@contextlib.contextmanager
def _maybe_use_float16(self, always_use: bool = False):
if always_use or (
utils.koboldai_vars.hascuda
and self.low_mem
and (utils.koboldai_vars.usegpu or utils.koboldai_vars.breakmodel)
):
original_dtype = torch.get_default_dtype()
torch.set_default_dtype(torch.float16)
yield True
torch.set_default_dtype(original_dtype)
else:
yield False
def breakmodel_device_list(self, n_layers, primary=None, selected=None):
# TODO: Find a better place for this or rework this
device_count = torch.cuda.device_count()
if device_count < 2:
primary = None
gpu_blocks = breakmodel.gpu_blocks + (
device_count - len(breakmodel.gpu_blocks)
) * [0]
print(f"{Colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{Colors.END}")
for i in range(device_count):
name = torch.cuda.get_device_name(i)
if len(name) > 47:
name = "..." + name[-44:]
row_color = Colors.END
sep_color = Colors.YELLOW
print(
f"{row_color}{Colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{Colors.END}"
)
row_color = Colors.END
sep_color = Colors.YELLOW
print(
f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}"
)
print(
f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){Colors.END}"
)
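
    # Decides how many layers go to each GPU, to the CPU and to the disk cache,
    # either from the --breakmodel_gpulayers / --breakmodel_disklayers command
    # line flags or by asking the user interactively, then falls back to plain
    # single-GPU or CPU-only mode when the resulting split turns out to be
    # trivial.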
def breakmodel_device_config(self, config):
# TODO: Find a better place for this or rework this
global breakmodel, generator
import breakmodel
n_layers = utils.num_layers(config)
if utils.args.cpu:
breakmodel.gpu_blocks = [0] * n_layers
return
elif (
utils.args.breakmodel_gpulayers is not None
or utils.args.breakmodel_disklayers is not None
):
try:
if not utils.args.breakmodel_gpulayers:
breakmodel.gpu_blocks = []
else:
breakmodel.gpu_blocks = list(
map(int, utils.args.breakmodel_gpulayers.split(","))
)
assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count()
s = n_layers
for i in range(len(breakmodel.gpu_blocks)):
if breakmodel.gpu_blocks[i] <= -1:
breakmodel.gpu_blocks[i] = s
break
else:
s -= breakmodel.gpu_blocks[i]
assert sum(breakmodel.gpu_blocks) <= n_layers
n_layers -= sum(breakmodel.gpu_blocks)
if utils.args.breakmodel_disklayers is not None:
assert utils.args.breakmodel_disklayers <= n_layers
breakmodel.disk_blocks = utils.args.breakmodel_disklayers
n_layers -= utils.args.breakmodel_disklayers
except:
logger.warning(
"--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0."
)
breakmodel.gpu_blocks = [n_layers]
n_layers = 0
elif utils.args.breakmodel_layers is not None:
breakmodel.gpu_blocks = [
n_layers - max(0, min(n_layers, utils.args.breakmodel_layers))
]
n_layers -= sum(breakmodel.gpu_blocks)
elif utils.args.model is not None:
logger.info("Breakmodel not specified, assuming GPU 0")
breakmodel.gpu_blocks = [n_layers]
n_layers = 0
else:
device_count = torch.cuda.device_count()
if device_count > 1:
print(
Colors.CYAN
+ "\nPlease select one of your GPUs to be your primary GPU."
)
print(
"VRAM usage in your primary GPU will be higher than for your other ones."
)
print("It is recommended you make your fastest GPU your primary GPU.")
self.breakmodel_device_list(n_layers)
while True:
primaryselect = input("device ID> ")
if (
primaryselect.isnumeric()
and 0 <= int(primaryselect) < device_count
):
breakmodel.primary_device = int(primaryselect)
break
else:
print(
f"{Colors.RED}Please enter an integer between 0 and {device_count-1}.{Colors.END}"
)
else:
breakmodel.primary_device = 0
print(
Colors.PURPLE
+ "\nIf you don't have enough VRAM to run the model on a single GPU"
)
print(
"you can split the model between your CPU and your GPU(s), or between"
)
print("multiple GPUs if you have more than one.")
print("By putting more 'layers' on a GPU or CPU, more computations will be")
print(
"done on that device and more VRAM or RAM will be required on that device"
)
print("(roughly proportional to number of layers).")
print(
"It should be noted that GPUs are orders of magnitude faster than the CPU."
)
print(
f"This model has{Colors.YELLOW} {n_layers} {Colors.PURPLE}layers.{Colors.END}\n"
)
for i in range(device_count):
self.breakmodel_device_list(
n_layers, primary=breakmodel.primary_device, selected=i
)
print(
f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
)
while True:
layerselect = input("# of layers> ")
if (
layerselect.isnumeric() or layerselect.strip() == "-1"
) and -1 <= int(layerselect) <= n_layers:
layerselect = int(layerselect)
layerselect = n_layers if layerselect == -1 else layerselect
breakmodel.gpu_blocks.append(layerselect)
n_layers -= layerselect
break
else:
print(
f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
)
if n_layers == 0:
break
if n_layers > 0:
self.breakmodel_device_list(
n_layers, primary=breakmodel.primary_device, selected=-1
)
print(
f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
)
while True:
layerselect = input("# of layers> ")
if (
layerselect.isnumeric() or layerselect.strip() == "-1"
) and -1 <= int(layerselect) <= n_layers:
layerselect = int(layerselect)
layerselect = n_layers if layerselect == -1 else layerselect
breakmodel.disk_blocks = layerselect
n_layers -= layerselect
break
else:
print(
f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
)
logger.init_ok("Final device configuration:", status="Info")
self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)
# If all layers are on the same device, use the old GPU generation mode
while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0:
breakmodel.gpu_blocks.pop()
if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (
-1,
utils.num_layers(config),
):
utils.koboldai_vars.breakmodel = False
utils.koboldai_vars.usegpu = True
utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks) - 1
return
if not breakmodel.gpu_blocks:
logger.warning("Nothing assigned to a GPU, reverting to CPU only mode")
import breakmodel
breakmodel.primary_device = "cpu"
utils.koboldai_vars.breakmodel = False
utils.koboldai_vars.usegpu = False
return