from __future__ import annotations

import gc
import os
import time
import bisect
import zipfile
import functools
import itertools
import traceback
import contextlib
from tqdm.auto import tqdm
from typing import Dict, List, Optional, Union

import torch
from torch.nn import Embedding
import transformers
from transformers import (
    StoppingCriteria,
    GPTNeoForCausalLM,
    AutoModelForCausalLM,
    LogitsProcessorList,
)

import utils
import modeling.lazy_loader as lazy_loader
from logger import logger, Colors

from modeling import warpers
from modeling.warpers import Warper
from modeling.stoppers import Stoppers
from modeling.post_token_hooks import PostTokenHooks
from modeling.inference_models.hf import HFInferenceModel
from modeling.inference_model import (
    GenerationResult,
    GenerationSettings,
    ModelCapabilities,
    use_core_manipulations,
)

try:
    import breakmodel
    import accelerate.utils
except ModuleNotFoundError as e:
    if not utils.koboldai_vars.use_colab_tpu:
        raise e

# When set to True, messages will appear in the console if samplers are not
# changing the scores. Keep in mind some samplers don't always change the
# scores for each token.
LOG_SAMPLER_NO_EFFECT = False


class HFTorchInferenceModel(HFInferenceModel):
    def __init__(
        self,
        model_name: str,
        lazy_load: bool,
        low_mem: bool,
    ) -> None:
        super().__init__(model_name)
        self.lazy_load = lazy_load
        self.low_mem = low_mem

        self.post_token_hooks = [
            PostTokenHooks.stream_tokens,
        ]

        self.stopper_hooks = [
            Stoppers.core_stopper,
            Stoppers.dynamic_wi_scanner,
            Stoppers.singleline_stopper,
            Stoppers.chat_mode_stopper,
        ]

        self.capabilties = ModelCapabilities(
            embedding_manipulation=True,
            post_token_hooks=True,
            stopper_hooks=True,
            post_token_probs=True,
        )
        self._old_stopping_criteria = None

    def _apply_warpers(
        self, scores: torch.Tensor, input_ids: torch.Tensor
    ) -> torch.Tensor:
        warpers.update_settings()

        if LOG_SAMPLER_NO_EFFECT:
            pre = torch.Tensor(scores)

        for sid in utils.koboldai_vars.sampler_order:
            warper = Warper.from_id(sid)

            if not warper.value_is_valid():
                continue

            if warper == warpers.RepetitionPenalty:
                # Rep pen needs more data than other samplers
                scores = warper.torch(scores, input_ids=input_ids)
            else:
                scores = warper.torch(scores)

            assert scores is not None, f"Scores are None; warper '{warper}' is to blame"

            if LOG_SAMPLER_NO_EFFECT:
                if torch.equal(pre, scores):
                    logger.info(f"{warper} had no effect on the scores.")
                pre = torch.Tensor(scores)
        return scores

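    # Illustrative sketch (added commentary, not original code): conceptually the
    # warper chain above behaves like a fold over the configured sampler order,
    # roughly:
    #
    #   scores = logits                      # shape: (batch, vocab)
    #   for sid in utils.koboldai_vars.sampler_order:
    #       scores = Warper.from_id(sid).torch(scores)
    #
    # Every warper must hand back a tensor of the same shape; RepetitionPenalty is
    # the one warper that also receives input_ids so it can down-weight tokens that
    # already appear in the context.
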
    def get_model_type(self) -> str:
        if not self.model_config:
            return "Read Only"

        if not isinstance(self.model_config, dict):
            return str(self.model_config.model_type)

        model_type = self.model_config.get("model_type")

        if model_type:
            return model_type

        if utils.koboldai_vars.mode.endswith("gpt2"):
            return "gpt2"
        else:
            return "Unknown"

    def _post_load(m_self) -> None:
        if not utils.koboldai_vars.model_type:
            utils.koboldai_vars.model_type = m_self.get_model_type()

        # Model specific overrides if a model has bad defaults
        if utils.koboldai_vars.model_type == "llama":
            m_self.tokenizer.decode_with_prefix_space = True
            m_self.tokenizer.add_bos_token = False

        # Patch stopping_criteria
        class PTHStopper(StoppingCriteria):
            def __call__(
                hf_self,
                input_ids: torch.LongTensor,
                scores: torch.FloatTensor,
            ) -> bool:
                m_self._post_token_gen(input_ids)

                for stopper in m_self.stopper_hooks:
                    do_stop = stopper(m_self, input_ids)
                    if do_stop:
                        return True
                return False

        old_gsc = transformers.GenerationMixin._get_stopping_criteria

        def _get_stopping_criteria(
            hf_self,
            *args,
            **kwargs,
        ):
            stopping_criteria = old_gsc(hf_self, *args, **kwargs)
            stopping_criteria.insert(0, PTHStopper())
            return stopping_criteria

        use_core_manipulations.get_stopping_criteria = _get_stopping_criteria

        # Patch logitswarpers

        def new_get_logits_processor(*args, **kwargs) -> LogitsProcessorList:
            processors = new_get_logits_processor.old_get_logits_processor(
                *args, **kwargs
            )
            return processors

        use_core_manipulations.get_logits_processor = new_get_logits_processor
        new_get_logits_processor.old_get_logits_processor = (
            transformers.GenerationMixin._get_logits_processor
        )

        class KoboldLogitsWarperList(LogitsProcessorList):
            def __init__(self):
                pass

            def __call__(
                lw_self,
                input_ids: torch.LongTensor,
                scores: torch.FloatTensor,
                *args,
                **kwargs,
            ):
                scores = m_self._apply_warpers(scores=scores, input_ids=input_ids)

                for processor in m_self.logits_processors:
                    scores = processor(m_self, scores=scores, input_ids=input_ids)
                    assert (
                        scores is not None
                    ), f"Scores are None; processor '{processor}' is to blame"
                return scores

        def new_get_logits_warper(
            beams: int = 1,
        ) -> LogitsProcessorList:
            return KoboldLogitsWarperList()

        def new_sample(self, *args, **kwargs):
            assert kwargs.pop("logits_warper", None) is not None
            kwargs["logits_warper"] = new_get_logits_warper(
                beams=1,
            )
            if utils.koboldai_vars.newlinemode in ["s", "ns"]:
                kwargs["eos_token_id"] = -1
                kwargs.setdefault("pad_token_id", 2)
            return new_sample.old_sample(self, *args, **kwargs)

        new_sample.old_sample = transformers.GenerationMixin.sample
        use_core_manipulations.sample = new_sample

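    # Note (added commentary, not original code): the patches registered above only
    # take effect while generation runs inside the use_core_manipulations context
    # manager, which is assumed to swap the patched attributes into
    # transformers.GenerationMixin and restore the originals afterwards, roughly:
    #
    #   with use_core_manipulations():
    #       out = self.model.generate(...)  # sample()/stopping criteria are patched here
    #
    # Outside that context, transformers keeps its stock sampling behaviour.
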
    def _raw_generate(
        self,
        prompt_tokens: Union[List[int], torch.Tensor],
        max_new: int,
        gen_settings: GenerationSettings,
        single_line: bool = False,
        batch_count: int = 1,
        seed: Optional[int] = None,
        **kwargs,
    ) -> GenerationResult:
        if not isinstance(prompt_tokens, torch.Tensor):
            gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
        else:
            gen_in = prompt_tokens

        device = utils.get_auxilary_device()
        gen_in = gen_in.to(device)

        additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else []

        if seed is not None:
            torch.manual_seed(seed)

        with torch.no_grad():
            start_time = time.time()
            genout = self.model.generate(
                gen_in,
                do_sample=True,
                max_length=min(
                    len(prompt_tokens) + max_new, utils.koboldai_vars.max_length
                ),
                repetition_penalty=1.0,
                bad_words_ids=utils.koboldai_vars.badwordsids
                + additional_bad_words_ids,
                use_cache=True,
                num_return_sequences=batch_count,
            )
        logger.debug(
            "torch_raw_generate: run generator {}s".format(time.time() - start_time)
        )

        return GenerationResult(
            self,
            out_batches=genout,
            prompt=prompt_tokens,
            is_whole_generation=False,
            output_includes_prompt=True,
        )

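    # Illustrative usage (added commentary, not original code); the argument values
    # and the GenerationSettings() construction are hypothetical:
    #
    #   result = model._raw_generate(
    #       prompt_tokens=[1, 2, 3],
    #       max_new=80,
    #       gen_settings=GenerationSettings(),
    #       batch_count=2,
    #       seed=42,
    #   )
    #   # result.out_batches holds batch_count sequences that still include the
    #   # prompt, which is why output_includes_prompt=True above.
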
    def _get_model(self, location: str, tf_kwargs: Dict):
        try:
            return AutoModelForCausalLM.from_pretrained(
                location,
                revision=utils.koboldai_vars.revision,
                cache_dir="cache",
                **tf_kwargs,
            )
        except Exception as e:
            logger.warning(f"Fell back to GPTNeoForCausalLM due to {e}")

            if "out of memory" in traceback.format_exc().lower():
                raise RuntimeError(
                    "One of your GPUs ran out of memory when KoboldAI tried to load your model."
                )

            return GPTNeoForCausalLM.from_pretrained(
                location,
                revision=utils.koboldai_vars.revision,
                cache_dir="cache",
                **tf_kwargs,
            )

    def get_hidden_size(self) -> int:
        return self.model.get_input_embeddings().embedding_dim

    def _will_load_with_safetensors(self) -> bool:
        path = self.get_local_model_path()

        # TODO: This might mess up download to run
        if not path:
            return False

        if not os.path.exists(os.path.join(path, "model.safetensors")):
            return False

        return True

    def _move_to_devices(self) -> None:
        for key, value in self.model.state_dict().items():
            target_dtype = (
                torch.float32 if breakmodel.primary_device == "cpu" else torch.float16
            )
            if value.dtype is not target_dtype:
                accelerate.utils.set_module_tensor_to_device(
                    self.model,
                    tensor_name=key,
                    device=torch.device(value.device),
                    value=value,
                    dtype=target_dtype,
                )

        disk_blocks = breakmodel.disk_blocks
        gpu_blocks = breakmodel.gpu_blocks
        ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks)
        cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
        device_map = {}

        for name in utils.layers_module_names:
            layer = int(name.rsplit(".", 1)[1])
            device = (
                ("disk" if layer < disk_blocks else "cpu")
                if layer < ram_blocks
                else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
            )
            device_map[name] = device

        for name in utils.get_missing_module_names(self.model, list(device_map.keys())):
            device_map[name] = breakmodel.primary_device

        breakmodel.dispatch_model_ex(
            self.model,
            device_map,
            main_device=breakmodel.primary_device,
            offload_buffers=True,
            offload_dir="accelerate-disk-cache",
        )

        gc.collect()
        return

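    # Note (added commentary, not original code): the device_map built above maps
    # layer module names to either an integer GPU index, "cpu", or "disk"; module
    # names and values below are hypothetical:
    #
    #   {"transformer.h.0": "disk", "transformer.h.1": "cpu", "transformer.h.2": 0}
    #
    # breakmodel.dispatch_model_ex (a breakmodel variant of accelerate's
    # dispatch_model) is then expected to hook each module so its weights live on
    # the assigned device and are brought to the main device only while that
    # module runs.
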
    # Function to patch transformers to use our soft prompt
    def patch_embedding(self) -> None:
        if getattr(Embedding, "_koboldai_patch_causallm_model", None):
            Embedding._koboldai_patch_causallm_model = self.model
            return

        old_embedding_call = Embedding.__call__

        kai_model = self

        def new_embedding_call(self, input_ids, *args, **kwargs):
            # Don't touch embeddings for models other than the core inference model (that's us!)
            if (
                Embedding._koboldai_patch_causallm_model.get_input_embeddings()
                is not self
            ):
                return old_embedding_call(self, input_ids, *args, **kwargs)

            assert input_ids is not None

            if utils.koboldai_vars.sp is not None:
                shifted_input_ids = input_ids - kai_model.model.config.vocab_size

            input_ids.clamp_(max=kai_model.model.config.vocab_size - 1)
            inputs_embeds = old_embedding_call(self, input_ids, *args, **kwargs)

            if utils.koboldai_vars.sp is not None:
                utils.koboldai_vars.sp = utils.koboldai_vars.sp.to(
                    inputs_embeds.dtype
                ).to(inputs_embeds.device)
                inputs_embeds = torch.where(
                    (shifted_input_ids >= 0)[..., None],
                    utils.koboldai_vars.sp[shifted_input_ids.clamp(min=0)],
                    inputs_embeds,
                )

            return inputs_embeds

        Embedding.__call__ = new_embedding_call
        Embedding._koboldai_patch_causallm_model = self.model

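    # Note (added commentary, not original code): the soft-prompt convention assumed
    # above is that token ids >= vocab_size select rows of the soft prompt tensor
    # (utils.koboldai_vars.sp) instead of the regular embedding matrix, e.g. with a
    # vocab_size of 50257 a hypothetical id of 50259 would pick sp[2], while
    # ordinary ids fall through to the unpatched Embedding.__call__.
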
    def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True):
        if not self.lazy_load:
            return

        if utils.args.breakmodel_disklayers is not None:
            breakmodel.disk_blocks = utils.args.breakmodel_disklayers

        disk_blocks = breakmodel.disk_blocks
        gpu_blocks = breakmodel.gpu_blocks
        ram_blocks = n_layers - sum(gpu_blocks)
        cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))

        def lazy_load_callback(
            model_dict: Dict[str, Union[lazy_loader.LazyTensor, torch.Tensor]],
            f,
            is_safetensors: bool = False,
            **_,
        ):
            if lazy_load_callback.nested:
                return
            lazy_load_callback.nested = True

            device_map: Dict[str, Union[str, int]] = {}

            @functools.lru_cache(maxsize=None)
            def get_original_key(key):
                return max(
                    (
                        original_key
                        for original_key in utils.module_names
                        if original_key.endswith(key)
                    ),
                    key=len,
                )

            for key, value in model_dict.items():
                original_key = get_original_key(key)

                if isinstance(value, lazy_loader.LazyTensor) and not any(
                    original_key.startswith(n) for n in utils.layers_module_names
                ):
                    device_map[key] = (
                        utils.koboldai_vars.gpu_device
                        if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu
                        else "cpu"
                        if not utils.koboldai_vars.hascuda
                        or not utils.koboldai_vars.breakmodel
                        else breakmodel.primary_device
                    )
                else:
                    layer = int(
                        max(
                            (
                                n
                                for n in utils.layers_module_names
                                if original_key.startswith(n)
                            ),
                            key=len,
                        ).rsplit(".", 1)[1]
                    )
                    device = (
                        utils.koboldai_vars.gpu_device
                        if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu
                        else "disk"
                        if layer < disk_blocks and layer < ram_blocks
                        else "cpu"
                        if not utils.koboldai_vars.hascuda
                        or not utils.koboldai_vars.breakmodel
                        else "shared"
                        if layer < ram_blocks
                        else bisect.bisect_right(
                            cumulative_gpu_blocks, layer - ram_blocks
                        )
                    )
                    device_map[key] = device

            if utils.num_shards is None or utils.current_shard == 0:
                utils.offload_index = {}
                if os.path.isdir("accelerate-disk-cache"):
                    # Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder
                    # (the folder doesn't contain any subfolders so os.remove will do just fine)
                    for filename in os.listdir("accelerate-disk-cache"):
                        try:
                            os.remove(os.path.join("accelerate-disk-cache", filename))
                        except OSError:
                            pass
                os.makedirs("accelerate-disk-cache", exist_ok=True)
                if utils.num_shards is not None:
                    num_tensors = len(
                        utils.get_sharded_checkpoint_num_tensors(
                            utils.from_pretrained_model_name,
                            utils.from_pretrained_index_filename,
                            **utils.from_pretrained_kwargs,
                        )
                    )
                else:
                    num_tensors = len(device_map)
                print(flush=True)
                utils.koboldai_vars.status_message = "Loading model"
                utils.koboldai_vars.total_layers = num_tensors
                utils.koboldai_vars.loaded_layers = 0
                utils.bar = tqdm(
                    total=num_tensors,
                    desc="Loading model tensors",
                    file=utils.UIProgressBarFile(),
                    position=1
                )

            if not is_safetensors:
                # Torch lazyload
                with zipfile.ZipFile(f, "r") as z:
                    try:
                        last_storage_key = None
                        zipfolder = os.path.basename(os.path.normpath(f)).split(".")[0]
                        f = None
                        current_offset = 0
                        able_to_pin_layers = True
                        if utils.num_shards is not None:
                            utils.current_shard += 1
                        for key in sorted(
                            device_map.keys(),
                            key=lambda k: (
                                model_dict[k].key,
                                model_dict[k].seek_offset,
                            ),
                        ):
                            storage_key = model_dict[key].key
                            if (
                                storage_key != last_storage_key
                                or model_dict[key].seek_offset < current_offset
                            ):
                                last_storage_key = storage_key
                                if isinstance(f, zipfile.ZipExtFile):
                                    f.close()
                                try:
                                    f = z.open(f"archive/data/{storage_key}")
                                except:
                                    f = z.open(f"{zipfolder}/data/{storage_key}")
                                current_offset = 0
                            if current_offset != model_dict[key].seek_offset:
                                f.read(model_dict[key].seek_offset - current_offset)
                                current_offset = model_dict[key].seek_offset
                            device = device_map[key]
                            size = functools.reduce(
                                lambda x, y: x * y, model_dict[key].shape, 1
                            )
                            dtype = model_dict[key].dtype
                            nbytes = (
                                size
                                if dtype is torch.bool
                                else size
                                * (
                                    (
                                        torch.finfo
                                        if dtype.is_floating_point
                                        else torch.iinfo
                                    )(dtype).bits
                                    >> 3
                                )
                            )
                            # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
                            model_dict[key] = model_dict[key].materialize(
                                f, map_location="cpu"
                            )
                            if model_dict[key].dtype is torch.float32:
                                utils.koboldai_vars.fp32_model = True
                            if (
                                convert_to_float16
                                and breakmodel.primary_device != "cpu"
                                and utils.koboldai_vars.hascuda
                                and (
                                    utils.koboldai_vars.breakmodel
                                    or utils.koboldai_vars.usegpu
                                )
                                and model_dict[key].dtype is torch.float32
                            ):
                                model_dict[key] = model_dict[key].to(torch.float16)
                            if breakmodel.primary_device == "cpu" or (
                                not utils.koboldai_vars.usegpu
                                and not utils.koboldai_vars.breakmodel
                                and model_dict[key].dtype is torch.float16
                            ):
                                model_dict[key] = model_dict[key].to(torch.float32)
                            if device == "shared":
                                model_dict[key] = model_dict[key].to("cpu").detach_()
                                if able_to_pin_layers:
                                    try:
                                        model_dict[key] = model_dict[key].pin_memory()
                                    except:
                                        able_to_pin_layers = False
                            elif device == "disk":
                                accelerate.utils.offload_weight(
                                    model_dict[key],
                                    get_original_key(key),
                                    "accelerate-disk-cache",
                                    index=utils.offload_index,
                                )
                                model_dict[key] = model_dict[key].to("meta")
                            else:
                                model_dict[key] = model_dict[key].to(device)
                            # print("OK", flush=True)
                            current_offset += nbytes
                            utils.bar.update(1)
                            utils.koboldai_vars.loaded_layers += 1
                    finally:
                        if (
                            utils.num_shards is None
                            or utils.current_shard >= utils.num_shards
                        ):
                            if utils.offload_index:
                                for name, tensor in utils.named_buffers:
                                    dtype = tensor.dtype
                                    if (
                                        convert_to_float16
                                        and breakmodel.primary_device != "cpu"
                                        and utils.koboldai_vars.hascuda
                                        and (
                                            utils.koboldai_vars.breakmodel
                                            or utils.koboldai_vars.usegpu
                                        )
                                    ):
                                        dtype = torch.float16
                                    if breakmodel.primary_device == "cpu" or (
                                        not utils.koboldai_vars.usegpu
                                        and not utils.koboldai_vars.breakmodel
                                    ):
                                        dtype = torch.float32
                                    if (
                                        name in model_dict
                                        and model_dict[name].dtype is not dtype
                                    ):
                                        model_dict[name] = model_dict[name].to(dtype)
                                    if tensor.dtype is not dtype:
                                        tensor = tensor.to(dtype)
                                    if name not in utils.offload_index:
                                        accelerate.utils.offload_weight(
                                            tensor,
                                            name,
                                            "accelerate-disk-cache",
                                            index=utils.offload_index,
                                        )
                                accelerate.utils.save_offload_index(
                                    utils.offload_index, "accelerate-disk-cache"
                                )
                            utils.bar.close()
                            utils.bar = None
                            utils.koboldai_vars.status_message = ""
                        lazy_load_callback.nested = False
                        if isinstance(f, zipfile.ZipExtFile):
                            f.close()
            else:
                # Loading with safetensors
                try:
                    able_to_pin_layers = True

                    if utils.num_shards is not None:
                        utils.current_shard += 1

                    for key in sorted(
                        device_map.keys(),
                        key=lambda k: model_dict[k].key,
                    ):
                        storage_key = model_dict[key].key

                        device = device_map[key]

                        # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)

                        model_dict[key] = model_dict[key].materialize(
                            f, map_location="cpu"
                        )

                        if model_dict[key].dtype is torch.float32:
                            utils.koboldai_vars.fp32_model = True

                        if (
                            convert_to_float16
                            and breakmodel.primary_device != "cpu"
                            and utils.koboldai_vars.hascuda
                            and (
                                utils.koboldai_vars.breakmodel
                                or utils.koboldai_vars.usegpu
                            )
                            and model_dict[key].dtype is torch.float32
                        ):
                            model_dict[key] = model_dict[key].to(torch.float16)

                        if breakmodel.primary_device == "cpu" or (
                            not utils.koboldai_vars.usegpu
                            and not utils.koboldai_vars.breakmodel
                            and model_dict[key].dtype is torch.float16
                        ):
                            model_dict[key] = model_dict[key].to(torch.float32)

                        if device == "shared":
                            model_dict[key] = model_dict[key].to("cpu").detach_()
                            if able_to_pin_layers:
                                try:
                                    model_dict[key] = model_dict[key].pin_memory()
                                except:
                                    able_to_pin_layers = False
                        elif device == "disk":
                            accelerate.utils.offload_weight(
                                model_dict[key],
                                get_original_key(key),
                                "accelerate-disk-cache",
                                index=utils.offload_index,
                            )
                            model_dict[key] = model_dict[key].to("meta")
                        else:
                            model_dict[key] = model_dict[key].to(device)

                        utils.bar.update(1)
                        utils.koboldai_vars.loaded_layers += 1

                finally:
                    if (
                        utils.num_shards is None
                        or utils.current_shard >= utils.num_shards
                    ):
                        if utils.offload_index:
                            for name, tensor in utils.named_buffers:
                                dtype = tensor.dtype
                                if (
                                    convert_to_float16
                                    and breakmodel.primary_device != "cpu"
                                    and utils.koboldai_vars.hascuda
                                    and (
                                        utils.koboldai_vars.breakmodel
                                        or utils.koboldai_vars.usegpu
                                    )
                                ):
                                    dtype = torch.float16
                                if breakmodel.primary_device == "cpu" or (
                                    not utils.koboldai_vars.usegpu
                                    and not utils.koboldai_vars.breakmodel
                                ):
                                    dtype = torch.float32
                                if (
                                    name in model_dict
                                    and model_dict[name].dtype is not dtype
                                ):
                                    model_dict[name] = model_dict[name].to(dtype)
                                if tensor.dtype is not dtype:
                                    tensor = tensor.to(dtype)
                                if name not in utils.offload_index:
                                    accelerate.utils.offload_weight(
                                        tensor,
                                        name,
                                        "accelerate-disk-cache",
                                        index=utils.offload_index,
                                    )
                            accelerate.utils.save_offload_index(
                                utils.offload_index, "accelerate-disk-cache"
                            )
                        utils.bar.close()
                        utils.bar = None
                        utils.koboldai_vars.status_message = ""

                    lazy_load_callback.nested = False

        lazy_load_callback.nested = False
        return lazy_load_callback

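    # Note (added commentary, not original code): lazy_load_callback is handed to
    # the modeling.lazy_loader machinery, which is assumed to invoke it once per
    # checkpoint file (or shard) with a dict of LazyTensor placeholders; the
    # callback then materializes each tensor directly onto its target device or
    # the accelerate disk cache instead of loading the whole state dict into RAM
    # first.
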
    @contextlib.contextmanager
    def _maybe_use_float16(self, always_use: bool = False):
        if always_use or (
            utils.koboldai_vars.hascuda
            and self.low_mem
            and (utils.koboldai_vars.usegpu or utils.koboldai_vars.breakmodel)
        ):
            original_dtype = torch.get_default_dtype()
            torch.set_default_dtype(torch.float16)
            yield True
            torch.set_default_dtype(original_dtype)
        else:
            yield False

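    # Illustrative usage (added commentary, not original code):
    #
    #   with self._maybe_use_float16():
    #       model = AutoModelForCausalLM.from_pretrained(...)  # weights default to fp16 while active
    #
    # The default dtype is only restored on the branch that actually switched it;
    # the else branch simply yields False and leaves the dtype alone.
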
    def breakmodel_device_list(self, n_layers, primary=None, selected=None):
        # TODO: Find a better place for this or rework this

        device_count = torch.cuda.device_count()
        if device_count < 2:
            primary = None
        gpu_blocks = breakmodel.gpu_blocks + (
            device_count - len(breakmodel.gpu_blocks)
        ) * [0]
        print(f"{Colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{Colors.END}")
        for i in range(device_count):
            name = torch.cuda.get_device_name(i)
            if len(name) > 47:
                name = "..." + name[-44:]
            row_color = Colors.END
            sep_color = Colors.YELLOW
            print(
                f"{row_color}{Colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{Colors.END}"
            )
        row_color = Colors.END
        sep_color = Colors.YELLOW
        print(
            f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}"
        )
        print(
            f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){Colors.END}"
        )

    def breakmodel_device_config(self, config):
        # TODO: Find a better place for this or rework this

        global breakmodel, generator
        import breakmodel

        n_layers = utils.num_layers(config)

        if utils.args.cpu:
            breakmodel.gpu_blocks = [0] * n_layers
            return

        elif (
            utils.args.breakmodel_gpulayers is not None
            or utils.args.breakmodel_disklayers is not None
        ):
            try:
                if not utils.args.breakmodel_gpulayers:
                    breakmodel.gpu_blocks = []
                else:
                    breakmodel.gpu_blocks = list(
                        map(int, utils.args.breakmodel_gpulayers.split(","))
                    )
                assert len(breakmodel.gpu_blocks) <= torch.cuda.device_count()
                s = n_layers
                for i in range(len(breakmodel.gpu_blocks)):
                    if breakmodel.gpu_blocks[i] <= -1:
                        breakmodel.gpu_blocks[i] = s
                        break
                    else:
                        s -= breakmodel.gpu_blocks[i]
                assert sum(breakmodel.gpu_blocks) <= n_layers
                n_layers -= sum(breakmodel.gpu_blocks)
                if utils.args.breakmodel_disklayers is not None:
                    assert utils.args.breakmodel_disklayers <= n_layers
                    breakmodel.disk_blocks = utils.args.breakmodel_disklayers
                    n_layers -= utils.args.breakmodel_disklayers
            except:
                logger.warning(
                    "--breakmodel_gpulayers is malformatted. Please use the --help option to see correct usage of --breakmodel_gpulayers. Defaulting to all layers on device 0."
                )
                breakmodel.gpu_blocks = [n_layers]
                n_layers = 0
        elif utils.args.breakmodel_layers is not None:
            breakmodel.gpu_blocks = [
                n_layers - max(0, min(n_layers, utils.args.breakmodel_layers))
            ]
            n_layers -= sum(breakmodel.gpu_blocks)
        elif utils.args.model is not None:
            logger.info("Breakmodel not specified, assuming GPU 0")
            breakmodel.gpu_blocks = [n_layers]
            n_layers = 0
        else:
            device_count = torch.cuda.device_count()
            if device_count > 1:
                print(
                    Colors.CYAN
                    + "\nPlease select one of your GPUs to be your primary GPU."
                )
                print(
                    "VRAM usage in your primary GPU will be higher than for your other ones."
                )
                print("It is recommended you make your fastest GPU your primary GPU.")
                self.breakmodel_device_list(n_layers)
                while True:
                    primaryselect = input("device ID> ")
                    if (
                        primaryselect.isnumeric()
                        and 0 <= int(primaryselect) < device_count
                    ):
                        breakmodel.primary_device = int(primaryselect)
                        break
                    else:
                        print(
                            f"{Colors.RED}Please enter an integer between 0 and {device_count-1}.{Colors.END}"
                        )
            else:
                breakmodel.primary_device = 0

            print(
                Colors.PURPLE
                + "\nIf you don't have enough VRAM to run the model on a single GPU"
            )
            print(
                "you can split the model between your CPU and your GPU(s), or between"
            )
            print("multiple GPUs if you have more than one.")
            print("By putting more 'layers' on a GPU or CPU, more computations will be")
            print(
                "done on that device and more VRAM or RAM will be required on that device"
            )
            print("(roughly proportional to number of layers).")
            print(
                "It should be noted that GPUs are orders of magnitude faster than the CPU."
            )
            print(
                f"This model has{Colors.YELLOW} {n_layers} {Colors.PURPLE}layers.{Colors.END}\n"
            )

            for i in range(device_count):
                self.breakmodel_device_list(
                    n_layers, primary=breakmodel.primary_device, selected=i
                )
                print(
                    f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into device {i}?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
                )
                while True:
                    layerselect = input("# of layers> ")
                    if (
                        layerselect.isnumeric() or layerselect.strip() == "-1"
                    ) and -1 <= int(layerselect) <= n_layers:
                        layerselect = int(layerselect)
                        layerselect = n_layers if layerselect == -1 else layerselect
                        breakmodel.gpu_blocks.append(layerselect)
                        n_layers -= layerselect
                        break
                    else:
                        print(
                            f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
                        )
                if n_layers == 0:
                    break

            if n_layers > 0:
                self.breakmodel_device_list(
                    n_layers, primary=breakmodel.primary_device, selected=-1
                )
                print(
                    f"{Colors.CYAN}\nHow many of the remaining{Colors.YELLOW} {n_layers} {Colors.CYAN}layers would you like to put into the disk cache?\nYou can also enter -1 to allocate all remaining layers to this device.{Colors.END}\n"
                )
                while True:
                    layerselect = input("# of layers> ")
                    if (
                        layerselect.isnumeric() or layerselect.strip() == "-1"
                    ) and -1 <= int(layerselect) <= n_layers:
                        layerselect = int(layerselect)
                        layerselect = n_layers if layerselect == -1 else layerselect
                        breakmodel.disk_blocks = layerselect
                        n_layers -= layerselect
                        break
                    else:
                        print(
                            f"{Colors.RED}Please enter an integer between -1 and {n_layers}.{Colors.END}"
                        )

        logger.init_ok("Final device configuration:", status="Info")
        self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)

        # If all layers are on the same device, use the old GPU generation mode
        while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0:
            breakmodel.gpu_blocks.pop()
        if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (
            -1,
            utils.num_layers(config),
        ):
            utils.koboldai_vars.breakmodel = False
            utils.koboldai_vars.usegpu = True
            utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks) - 1
            return

        if not breakmodel.gpu_blocks:
            logger.warning("Nothing assigned to a GPU, reverting to CPU only mode")
            import breakmodel

            breakmodel.primary_device = "cpu"
            utils.koboldai_vars.breakmodel = False
            utils.koboldai_vars.usegpu = False
            return
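
    # Example (added commentary, not original code): the layer split configured
    # above can also be supplied non-interactively through the existing CLI flags;
    # the exact command line below is hypothetical:
    #
    #   python aiserver.py --model EleutherAI/gpt-neo-2.7B \
    #       --breakmodel_gpulayers 24,8 --breakmodel_disklayers 0
    #
    # which would place 24 layers on GPU 0, 8 on GPU 1, none in the disk cache,
    # and the remainder on the CPU.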