Pull upstream changes, fix conflicts
@@ -1,15 +1,13 @@
from __future__ import annotations
from dataclasses import dataclass

import gc
import os
import time
import bisect
import zipfile
import functools
import itertools
import traceback
import contextlib
from tqdm.auto import tqdm
from torch import nn
from typing import Dict, List, Optional, Union

import torch
@@ -39,19 +37,52 @@ from modeling.inference_model import (
    use_core_manipulations,
)

try:
    import breakmodel
    import accelerate.utils
except ModuleNotFoundError as e:
    if not utils.koboldai_vars.use_colab_tpu:
        raise e

# When set to true, messages will appear in the console if samplers are not
# changing the scores. Keep in mind some samplers don't always change the
# scores for each token.
LOG_SAMPLER_NO_EFFECT = False


class BreakmodelConfig:
    def __init__(self) -> None:
        self.disk_blocks = 0
        self.gpu_blocks = []

    @property
    def primary_device(self):
        if utils.args.cpu:
            return "cpu"
        elif not sum(self.gpu_blocks):
            # No blocks are on GPU
            return "cpu"
        elif torch.cuda.device_count() <= 0:
            return "cpu"

        for device_index, blocks in enumerate(self.gpu_blocks):
            if blocks:
                return device_index
        return 0

    def get_device_map(self, model: nn.Module) -> dict:
        ram_blocks = len(utils.layers_module_names) - sum(self.gpu_blocks)
        cumulative_gpu_blocks = tuple(itertools.accumulate(self.gpu_blocks))
        device_map = {}

        for name in utils.layers_module_names:
            layer = int(name.rsplit(".", 1)[1])
            device = (
                ("disk" if layer < self.disk_blocks else "cpu")
                if layer < ram_blocks
                else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
            )
            device_map[name] = device

        for name in utils.get_missing_module_names(model, list(device_map.keys())):
            device_map[name] = self.primary_device

        return device_map

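
# Editorial note (not part of the upstream file): "breakmodel" spreads a
# model's transformer layers across devices, so a checkpoint that does not fit
# on one GPU can be split over several GPUs, CPU RAM and an on-disk cache. As
# an illustrative sketch, for a 9-layer model with disk_blocks=2 and
# gpu_blocks=[4, 2], get_device_map() assigns roughly:
#   layers 0-1 -> "disk", layer 2 -> "cpu",
#   layers 3-6 -> GPU 0,  layers 7-8 -> GPU 1,
# and every module not in utils.layers_module_names (embeddings, final norm,
# ...) falls back to primary_device. Exact module names vary by architecture.
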
class HFTorchInferenceModel(HFInferenceModel):
    def __init__(self) -> None:
        super().__init__()
@@ -79,6 +110,29 @@ class HFTorchInferenceModel(HFInferenceModel):
            post_token_probs=True,
        )
        self._old_stopping_criteria = None
        self.breakmodel_config = BreakmodelConfig()

    def set_input_parameters(self, parameters):
        ret = super().set_input_parameters(parameters)

        # Hook onto input param setting for setting breakmodel stuff
        if self.breakmodel:
            self.breakmodel_config.gpu_blocks = self.layers
            self.breakmodel_config.disk_blocks = self.disk_layers

        return ret

    def get_auxilary_device(self) -> Union[str, int, torch.device]:
        return self.breakmodel_config.primary_device

    def _get_target_dtype(self) -> Union[torch.float16, torch.float32]:
        if self.breakmodel_config.primary_device == "cpu":
            return torch.float32
        elif utils.args.cpu:
            return torch.float32
        elif not self.usegpu and not self.breakmodel:
            return torch.float32
        return torch.float16
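
    # Editorial note: half precision is only used when the weights will live on
    # a GPU; pure CPU runs stay in float32, since fp16 matrix ops on CPU are
    # poorly supported and would save no VRAM anyway.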

    def _apply_warpers(
        self, scores: torch.Tensor, input_ids: torch.Tensor
@@ -125,19 +179,7 @@ class HFTorchInferenceModel(HFInferenceModel):
        else:
            return "Unknown"

    def get_auxilary_device(self):
        """Get the device that auxiliary tensors (like inputs) should be stored on."""

        # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU.
        if utils.koboldai_vars.hascuda and self.usegpu:
            return utils.koboldai_vars.gpu_device
        elif utils.koboldai_vars.hascuda and self.breakmodel:
            import breakmodel
            return breakmodel.primary_device
        return "cpu"

    def _post_load(m_self) -> None:

        if not utils.koboldai_vars.model_type:
            utils.koboldai_vars.model_type = m_self.get_model_type()

@@ -220,6 +262,40 @@ class HFTorchInferenceModel(HFInferenceModel):
        new_sample.old_sample = transformers.GenerationMixin.sample
        use_core_manipulations.sample = new_sample

        # PEFT Loading. This MUST be done after all save_pretrained calls are
        # finished on the main model.
        if utils.args.peft:
            from peft import PeftModel, PeftConfig
            local_peft_dir = os.path.join(m_self.get_local_model_path(), "peft")

            # Make PEFT dir if it doesn't exist
            try:
                os.makedirs(local_peft_dir)
            except FileExistsError:
                pass

            peft_local_path = os.path.join(local_peft_dir, utils.args.peft.replace("/", "_"))
            logger.debug(f"Loading PEFT '{utils.args.peft}', possible local path is '{peft_local_path}'.")

            peft_installed_locally = True
            possible_peft_locations = [peft_local_path, utils.args.peft]

            for i, location in enumerate(possible_peft_locations):
                try:
                    m_self.model = PeftModel.from_pretrained(m_self.model, location)
                    logger.debug(f"Loaded PEFT at '{location}'")
                    break
                except ValueError:
                    peft_installed_locally = False
                    if i == len(possible_peft_locations) - 1:
                        raise RuntimeError(f"Unable to load PeftModel for given name '{utils.args.peft}'. Does it exist?")
                except RuntimeError:
                    raise RuntimeError("Error while loading PeftModel. Are you using the correct model?")

            if not peft_installed_locally:
                logger.debug(f"PEFT not saved to models folder; saving to '{peft_local_path}'")
                m_self.model.save_pretrained(peft_local_path)
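
            # Editorial note: the adapter is looked up first under the model's
            # local "peft" folder (with "/" in its name replaced by "_") and
            # then as a Hugging Face Hub ID; a Hub-loaded adapter is saved back
            # to that folder so later runs can load it locally.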

        return super()._post_load()

    def _raw_generate(
@@ -236,9 +312,11 @@ class HFTorchInferenceModel(HFInferenceModel):
            gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
        else:
            gen_in = prompt_tokens

        if not self.usegpu and not self.breakmodel:
            gen_in = gen_in.to("cpu")
        else:
            device = self.get_auxilary_device()
            gen_in = gen_in.to(device)

        additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else []

@@ -254,8 +332,7 @@ class HFTorchInferenceModel(HFInferenceModel):
                    len(prompt_tokens) + max_new, utils.koboldai_vars.max_length
                ),
                repetition_penalty=1.0,
                bad_words_ids=self.badwordsids + additional_bad_words_ids,
                use_cache=True,
                num_return_sequences=batch_count,
            )
@@ -275,6 +352,9 @@ class HFTorchInferenceModel(HFInferenceModel):
        tf_kwargs["revision"] = utils.koboldai_vars.revision
        tf_kwargs["cache_dir"] = "cache"

        if self.lazy_load:
            tf_kwargs.pop("low_cpu_mem_usage", None)
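            # Editorial note: presumably dropped because the lazy loader (and
            # the device_map set up below) already controls how tensors are
            # materialized, so low_cpu_mem_usage would be redundant here.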

        # If we have model hints for legacy model, use them rather than fall back.
        try:
            if self.model_name == "GPT2Custom":
@@ -283,10 +363,63 @@ class HFTorchInferenceModel(HFInferenceModel):
                return GPTNeoForCausalLM.from_pretrained(location, **tf_kwargs)
        except Exception as e:
            logger.warning(f"{self.model_name} is a no-go; {e} - Falling back to auto.")
            if utils.args.panic:
                raise

        # Try to determine model type from either AutoModel or falling back to legacy
        try:
            if self.lazy_load:
                with lazy_loader.use_lazy_load(dematerialized_modules=True):
                    metamodel = AutoModelForCausalLM.from_config(self.model_config)
                    if utils.args.cpu:
                        cpu_map = {name: "cpu" for name in utils.layers_module_names}
                        for name in utils.get_missing_module_names(
                            metamodel, list(cpu_map.keys())
                        ):
                            cpu_map[name] = "cpu"
                        tf_kwargs["device_map"] = cpu_map
                    else:
                        tf_kwargs["device_map"] = self.breakmodel_config.get_device_map(
                            metamodel
                        )

            try:
                # Try to load with the lazyloader first...
                with lazy_loader.use_lazy_load(
                    enable=self.lazy_load,
                    # DO NOT DEMATERIALIZE MODULES / INIT WEIGHTS EMPTY!!! IT WILL EXPLODE!!!!!!!
                    dematerialized_modules=False,
                ):
                    model = AutoModelForCausalLM.from_pretrained(
                        location,
                        offload_folder="accelerate-disk-cache",
                        torch_dtype=self._get_target_dtype(),
                        **tf_kwargs,
                    )
            except Exception as e:
                # ...but fall back to stock HF if lazyloader fails.
                if utils.args.panic:
                    raise
                logger.error("Lazyloader failed, falling back to stock HF load. You may run out of RAM here. Details:")
                logger.error(e)
                logger.error(traceback.format_exc())
                logger.info("Falling back to stock HF load...")

                model = AutoModelForCausalLM.from_pretrained(
                    location,
                    offload_folder="accelerate-disk-cache",
                    torch_dtype=self._get_target_dtype(),
                    **tf_kwargs,
                )

            if not self.lazy_load and not self.breakmodel:
                # We need to move the model to the desired device
                if (not self.usegpu) or torch.cuda.device_count() <= 0:
                    model = model.to("cpu")
                else:
                    model = model.to("cuda")

            return model
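
            # Editorial note: the overall fallback chain for loading is legacy
            # model class -> lazy-loaded AutoModel -> stock AutoModel ->
            # GPT2LMHeadModel (handled further below).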
        except Exception as e:
            traceback_string = traceback.format_exc().lower()

@@ -300,6 +433,9 @@ class HFTorchInferenceModel(HFInferenceModel):
                logger.error("Invalid load key! Aborting.")
                raise

            if utils.args.panic:
                raise

            logger.warning(f"Fell back to GPT2LMHeadModel due to {e}")
            logger.debug(traceback.format_exc())

@@ -325,49 +461,6 @@ class HFTorchInferenceModel(HFInferenceModel):

        return True

    def _move_to_devices(self) -> None:
        for key, value in self.model.state_dict().items():
            target_dtype = (
                torch.float32 if breakmodel.primary_device == "cpu" else torch.float16
            )
            if value.dtype is not target_dtype:
                accelerate.utils.set_module_tensor_to_device(
                    self.model,
                    tensor_name=key,
                    device=torch.device(value.device),
                    value=value,
                    dtype=target_dtype,
                )

        disk_blocks = breakmodel.disk_blocks
        gpu_blocks = breakmodel.gpu_blocks
        ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks)
        cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
        device_map = {}

        for name in utils.layers_module_names:
            layer = int(name.rsplit(".", 1)[1])
            device = (
                ("disk" if layer < disk_blocks else "cpu")
                if layer < ram_blocks
                else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
            )
            device_map[name] = device

        for name in utils.get_missing_module_names(self.model, list(device_map.keys())):
            device_map[name] = breakmodel.primary_device

        breakmodel.dispatch_model_ex(
            self.model,
            device_map,
            main_device=breakmodel.primary_device,
            offload_buffers=True,
            offload_dir="accelerate-disk-cache",
        )

        gc.collect()
        return

    # Function to patch transformers to use our soft prompt
    def patch_embedding(self) -> None:
        if getattr(Embedding, "_koboldai_patch_causallm_model", None):
@@ -409,404 +502,20 @@ class HFTorchInferenceModel(HFInferenceModel):
        Embedding.__call__ = new_embedding_call
        Embedding._koboldai_patch_causallm_model = self.model

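    # Legacy lazy-load path: the callback below materializes checkpoint tensors
    # one at a time (zip-based torch checkpoints or safetensors) and places each
    # on a GPU, in pinned "shared" CPU memory, or in the accelerate disk cache
    # according to the breakmodel block counts.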
    def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True):
        if not self.lazy_load:
            return

        disk_blocks = breakmodel.disk_blocks
        gpu_blocks = breakmodel.gpu_blocks
        ram_blocks = ram_blocks = n_layers - sum(gpu_blocks)
        cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))

        def lazy_load_callback(
            model_dict: Dict[str, Union[lazy_loader.LazyTensor, torch.Tensor]],
            f,
            is_safetensors: bool = False,
            **_,
        ):
            if lazy_load_callback.nested:
                return
            lazy_load_callback.nested = True

            device_map: Dict[str, Union[str, int]] = {}

            @functools.lru_cache(maxsize=None)
            def get_original_key(key) -> Optional[str]:
                try:
                    key_candidates = [
                        original_key
                        for original_key in utils.module_names
                        if original_key.endswith(key)
                    ]
                except ValueError:
                    return key

                if not key_candidates:
                    logger.debug(f"!!! No key candidates for {key}")
                    return None

                return max(key_candidates, key=len)

            for key, value in model_dict.items():
                original_key = get_original_key(key)

                if not original_key:
                    continue

                if isinstance(value, lazy_loader.LazyTensor) and not any(
                    original_key.startswith(n) for n in utils.layers_module_names
                ):
                    device_map[key] = (
                        utils.koboldai_vars.gpu_device
                        if utils.koboldai_vars.hascuda and self.usegpu
                        else "cpu"
                        if not utils.koboldai_vars.hascuda
                        or not self.breakmodel
                        else breakmodel.primary_device
                    )
                else:
                    layer = int(
                        max(
                            (
                                n
                                for n in utils.layers_module_names
                                if original_key.startswith(n)
                            ),
                            key=len,
                        ).rsplit(".", 1)[1]
                    )
                    device = (
                        utils.koboldai_vars.gpu_device
                        if utils.koboldai_vars.hascuda and self.usegpu
                        else "disk"
                        if layer < disk_blocks and layer < ram_blocks
                        else "cpu"
                        if not utils.koboldai_vars.hascuda
                        or not self.breakmodel
                        else "shared"
                        if layer < ram_blocks
                        else bisect.bisect_right(
                            cumulative_gpu_blocks, layer - ram_blocks
                        )
                    )
                    device_map[key] = device

            if utils.num_shards is None or utils.current_shard == 0:
                utils.offload_index = {}
                if os.path.isdir("accelerate-disk-cache"):
                    # Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder
                    # (the folder doesn't contain any subfolders so os.remove will do just fine)
                    for filename in os.listdir("accelerate-disk-cache"):
                        try:
                            os.remove(os.path.join("accelerate-disk-cache", filename))
                        except OSError:
                            pass
                os.makedirs("accelerate-disk-cache", exist_ok=True)
                if utils.num_shards is not None:
                    num_tensors = len(
                        utils.get_sharded_checkpoint_num_tensors(
                            utils.from_pretrained_model_name,
                            utils.from_pretrained_index_filename,
                            is_safetensors=is_safetensors,
                            **utils.from_pretrained_kwargs,
                        )
                    )
                else:
                    num_tensors = len(device_map)
                print(flush=True)
                utils.koboldai_vars.status_message = "Loading model"
                utils.koboldai_vars.total_layers = num_tensors
                utils.koboldai_vars.loaded_layers = 0
                utils.bar = tqdm(
                    total=num_tensors,
                    desc="Loading model tensors",
                    file=utils.UIProgressBarFile(),
                    position=1
                )

            if not is_safetensors:
                # Torch lazyload
                with zipfile.ZipFile(f, "r") as z:
                    try:
                        last_storage_key = None
                        zipfolder = os.path.basename(os.path.normpath(f)).split(".")[0]
                        f = None
                        current_offset = 0
                        able_to_pin_layers = True
                        if utils.num_shards is not None:
                            utils.current_shard += 1
                        for key in sorted(
                            device_map.keys(),
                            key=lambda k: (
                                model_dict[k].key,
                                model_dict[k].seek_offset,
                            ),
                        ):
                            storage_key = model_dict[key].key
                            if (
                                storage_key != last_storage_key
                                or model_dict[key].seek_offset < current_offset
                            ):
                                last_storage_key = storage_key
                                if isinstance(f, zipfile.ZipExtFile):
                                    f.close()
                                ziproot = z.namelist()[0].split("/")[0]
                                f = z.open(f"{ziproot}/data/{storage_key}")

                                current_offset = 0
                            if current_offset != model_dict[key].seek_offset:
                                f.read(model_dict[key].seek_offset - current_offset)
                                current_offset = model_dict[key].seek_offset
                            device = device_map[key]
                            size = functools.reduce(
                                lambda x, y: x * y, model_dict[key].shape, 1
                            )
                            dtype = model_dict[key].dtype
                            nbytes = (
                                size
                                if dtype is torch.bool
                                else size
                                * (
                                    (
                                        torch.finfo
                                        if dtype.is_floating_point
                                        else torch.iinfo
                                    )(dtype).bits
                                    >> 3
                                )
                            )
                            # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
                            #logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ")
                            model_dict[key] = model_dict[key].materialize(
                                f, map_location="cpu"
                            )
                            if model_dict[key].dtype is torch.float32:
                                utils.koboldai_vars.fp32_model = True
                            if (
                                convert_to_float16
                                and breakmodel.primary_device != "cpu"
                                and utils.koboldai_vars.hascuda
                                and (
                                    self.breakmodel
                                    or self.usegpu
                                )
                                and model_dict[key].dtype is torch.float32
                            ):
                                model_dict[key] = model_dict[key].to(torch.float16)
                            if breakmodel.primary_device == "cpu" or (
                                not self.usegpu
                                and not self.breakmodel
                                and model_dict[key].dtype is torch.float16
                            ):
                                model_dict[key] = model_dict[key].to(torch.float32)
                            if device == "shared":
                                model_dict[key] = model_dict[key].to("cpu").detach_()
                                if able_to_pin_layers:
                                    try:
                                        model_dict[key] = model_dict[key].pin_memory()
                                    except:
                                        able_to_pin_layers = False
                            elif device == "disk":
                                accelerate.utils.offload_weight(
                                    model_dict[key],
                                    get_original_key(key),
                                    "accelerate-disk-cache",
                                    index=utils.offload_index,
                                )
                                model_dict[key] = model_dict[key].to("meta")
                            else:
                                model_dict[key] = model_dict[key].to(device)
                            # print("OK", flush=True)
                            current_offset += nbytes
                            utils.bar.update(1)
                            utils.koboldai_vars.loaded_layers += 1
                    finally:
                        if (
                            utils.num_shards is None
                            or utils.current_shard >= utils.num_shards
                        ):
                            if utils.offload_index:
                                for name, tensor in utils.named_buffers:
                                    dtype = tensor.dtype
                                    if (
                                        convert_to_float16
                                        and breakmodel.primary_device != "cpu"
                                        and utils.koboldai_vars.hascuda
                                        and (
                                            self.breakmodel
                                            or self.usegpu
                                        )
                                    ):
                                        dtype = torch.float16
                                    if breakmodel.primary_device == "cpu" or (
                                        not self.usegpu
                                        and not self.breakmodel
                                    ):
                                        dtype = torch.float32
                                    if (
                                        name in model_dict
                                        and model_dict[name].dtype is not dtype
                                    ):
                                        model_dict[name] = model_dict[name].to(dtype)
                                    if tensor.dtype is not dtype:
                                        tensor = tensor.to(dtype)
                                    if name not in utils.offload_index:
                                        accelerate.utils.offload_weight(
                                            tensor,
                                            name,
                                            "accelerate-disk-cache",
                                            index=utils.offload_index,
                                        )
                                accelerate.utils.save_offload_index(
                                    utils.offload_index, "accelerate-disk-cache"
                                )
                            utils.bar.close()
                            utils.bar = None
                            utils.koboldai_vars.status_message = ""
                        lazy_load_callback.nested = False
                        if isinstance(f, zipfile.ZipExtFile):
                            f.close()
            else:
                # Loading with safetensors
                try:
                    able_to_pin_layers = True

                    if utils.num_shards is not None:
                        utils.current_shard += 1

                    for key in sorted(
                        device_map.keys(),
                        key=lambda k: model_dict[k].key,
                    ):
                        storage_key = model_dict[key].key

                        device = device_map[key]

                        # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)

                        model_dict[key] = model_dict[key].materialize(
                            f, map_location="cpu"
                        )

                        if model_dict[key].dtype is torch.float32:
                            utils.koboldai_vars.fp32_model = True

                        if (
                            convert_to_float16
                            and breakmodel.primary_device != "cpu"
                            and utils.koboldai_vars.hascuda
                            and (
                                self.breakmodel
                                or self.usegpu
                            )
                            and model_dict[key].dtype is torch.float32
                        ):
                            model_dict[key] = model_dict[key].to(torch.float16)

                        if breakmodel.primary_device == "cpu" or (
                            not self.usegpu
                            and not self.breakmodel
                            and model_dict[key].dtype is torch.float16
                        ):
                            model_dict[key] = model_dict[key].to(torch.float32)

                        if device == "shared":
                            model_dict[key] = model_dict[key].to("cpu").detach_()
                            if able_to_pin_layers:
                                try:
                                    model_dict[key] = model_dict[key].pin_memory()
                                except:
                                    able_to_pin_layers = False
                        elif device == "disk":
                            accelerate.utils.offload_weight(
                                model_dict[key],
                                get_original_key(key),
                                "accelerate-disk-cache",
                                index=utils.offload_index,
                            )
                            model_dict[key] = model_dict[key].to("meta")
                        else:
                            model_dict[key] = model_dict[key].to(device)

                        utils.bar.update(1)
                        utils.koboldai_vars.loaded_layers += 1

                finally:
                    if (
                        utils.num_shards is None
                        or utils.current_shard >= utils.num_shards
                    ):
                        if utils.offload_index:
                            for name, tensor in utils.named_buffers:
                                dtype = tensor.dtype
                                if (
                                    convert_to_float16
                                    and breakmodel.primary_device != "cpu"
                                    and utils.koboldai_vars.hascuda
                                    and (
                                        self.breakmodel
                                        or self.usegpu
                                    )
                                ):
                                    dtype = torch.float16
                                if breakmodel.primary_device == "cpu" or (
                                    not self.usegpu
                                    and not self.breakmodel
                                ):
                                    dtype = torch.float32
                                if (
                                    name in model_dict
                                    and model_dict[name].dtype is not dtype
                                ):
                                    model_dict[name] = model_dict[name].to(dtype)
                                if tensor.dtype is not dtype:
                                    tensor = tensor.to(dtype)
                                if name not in utils.offload_index:
                                    accelerate.utils.offload_weight(
                                        tensor,
                                        name,
                                        "accelerate-disk-cache",
                                        index=utils.offload_index,
                                    )
                            accelerate.utils.save_offload_index(
                                utils.offload_index, "accelerate-disk-cache"
                            )
                        utils.bar.close()
                        utils.bar = None
                        utils.koboldai_vars.status_message = ""

                    lazy_load_callback.nested = False

        lazy_load_callback.nested = False
        return lazy_load_callback

    @contextlib.contextmanager
    def _maybe_use_float16(self, always_use: bool = False):
        if always_use or (
            utils.koboldai_vars.hascuda
            and self.low_mem
            and (self.usegpu or self.breakmodel)
        ):
            original_dtype = torch.get_default_dtype()
            torch.set_default_dtype(torch.float16)
            yield True
            torch.set_default_dtype(original_dtype)
        else:
            yield False
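
    # Usage sketch (editorial, not from the upstream file): wrap weight
    # creation so tensors default to fp16 when low_mem is set and a GPU will
    # hold them, e.g.:
    #     with self._maybe_use_float16():
    #         model = AutoModelForCausalLM.from_pretrained(location, **tf_kwargs)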

    def breakmodel_device_list(self, n_layers, primary=None, selected=None):
        # TODO: Find a better place for this or rework this

        device_count = torch.cuda.device_count()
        if device_count < 2:
            primary = None

        logger.debug("n_layers: {}".format(n_layers))
        logger.debug("gpu blocks: {}".format(self.breakmodel_config.gpu_blocks))

        gpu_blocks = self.breakmodel_config.gpu_blocks + (
            device_count - len(self.breakmodel_config.gpu_blocks)
        ) * [0]

        print(f"{Colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{Colors.END}")

        for i in range(device_count):
            name = torch.cuda.get_device_name(i)
            if len(name) > 47:
@@ -816,72 +525,83 @@ class HFTorchInferenceModel(HFInferenceModel):
            print(
                f"{row_color}{Colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{Colors.END}"
            )

        row_color = Colors.END
        sep_color = Colors.YELLOW
        print(
            f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {self.breakmodel_config.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}"
        )
        print(
            f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){Colors.END}"
        )

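    # Example output (editorial sketch; GPU names and block counts will vary):
    #        DEVICE ID  |  LAYERS  |  DEVICE NAME
    #  -> (primary)   0 |  24 |  NVIDIA GeForce RTX 3090
    #                 1 |   8 |  NVIDIA GeForce GTX 1080 Ti
    #               N/A |   0 |  (Disk cache)
    #               N/A |   0 |  (CPU)
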
    def breakmodel_device_config(self, config):
        # TODO: Find a better place for this or rework this

        global breakmodel, generator
        import breakmodel

        n_layers = utils.num_layers(config)

        logger.debug(
            "gpu blocks before modification: {}".format(
                self.breakmodel_config.gpu_blocks
            )
        )

        if utils.args.cpu:
            self.breakmodel_config.gpu_blocks = [0] * n_layers
            return

        elif self.breakmodel_config.gpu_blocks == []:
            logger.info("Breakmodel not specified, assuming GPU 0")
            self.breakmodel_config.gpu_blocks = [n_layers]
            n_layers = 0

        else:
            s = n_layers
            for i in range(len(self.breakmodel_config.gpu_blocks)):
                if self.breakmodel_config.gpu_blocks[i] <= -1:
                    self.breakmodel_config.gpu_blocks[i] = s
                    break
                else:
                    s -= self.breakmodel_config.gpu_blocks[i]
            assert sum(self.breakmodel_config.gpu_blocks) <= n_layers
            n_layers -= sum(self.breakmodel_config.gpu_blocks)
            if self.breakmodel_config.disk_blocks is not None:
                assert self.breakmodel_config.disk_blocks <= n_layers
                n_layers -= self.breakmodel_config.disk_blocks

        logger.init_ok("Final device configuration:", status="Info")
        self.breakmodel_device_list(
            n_layers, primary=self.breakmodel_config.primary_device
        )
        with open(
            "settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w"
        ) as file:
            file.write(
                "{}\n{}".format(
                    ",".join(map(str, self.breakmodel_config.gpu_blocks)),
                    self.breakmodel_config.disk_blocks,
                )
            )

        # If all layers are on the same device, use the old GPU generation mode
        while (
            len(self.breakmodel_config.gpu_blocks)
            and self.breakmodel_config.gpu_blocks[-1] == 0
        ):
            self.breakmodel_config.gpu_blocks.pop()
        self.breakmodel = True
        if len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[
            -1
        ] in (
            -1,
            utils.num_layers(config),
        ):
            logger.debug("All layers on same GPU. Breakmodel disabled")
            self.breakmodel = False
            self.usegpu = True
            utils.koboldai_vars.gpu_device = len(self.breakmodel_config.gpu_blocks) - 1
            return

        if not self.breakmodel_config.gpu_blocks:
            logger.warning("Nothing assigned to a GPU, reverting to CPU only mode")
            import breakmodel

            breakmodel.primary_device = "cpu"
            self.breakmodel = False
            self.usegpu = False
            return