Pull upstream changes, fix conflicts

0cc4m
2023-07-15 23:01:52 +02:00
33 changed files with 3868 additions and 3147 deletions


@@ -1,15 +1,13 @@
from __future__ import annotations
from dataclasses import dataclass
import gc
import os
import time
import bisect
import zipfile
import functools
import itertools
import traceback
import contextlib
from tqdm.auto import tqdm
from torch import nn
from typing import Dict, List, Optional, Union
import torch
@@ -39,19 +37,52 @@ from modeling.inference_model import (
use_core_manipulations,
)
try:
import breakmodel
import accelerate.utils
except ModuleNotFoundError as e:
if not utils.koboldai_vars.use_colab_tpu:
raise e
# When set to true, messages will appear in the console if samplers are not
# changing the scores. Keep in mind some samplers don't always change the
# scores for each token.
LOG_SAMPLER_NO_EFFECT = False
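# BreakmodelConfig tracks how the model's transformer layers are split across
# devices: gpu_blocks holds a per-GPU layer count, disk_blocks holds the number
# of layers offloaded to disk, and anything left over stays in CPU RAM.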
class BreakmodelConfig:
def __init__(self) -> None:
self.disk_blocks = 0
self.gpu_blocks = []
@property
def primary_device(self):
if utils.args.cpu:
return "cpu"
elif not sum(self.gpu_blocks):
# No blocks are on GPU
return "cpu"
elif torch.cuda.device_count() <= 0:
return "cpu"
for device_index, blocks in enumerate(self.gpu_blocks):
if blocks:
return device_index
return 0
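    # get_device_map translates those block counts into an accelerate-style
    # device map (module name -> "disk", "cpu", or a GPU index) for every layer
    # module; non-layer modules fall back to the primary device.
    #
    # Illustrative example with hypothetical numbers (not taken from this
    # commit): for a 12-layer model with disk_blocks=2 and gpu_blocks=[3, 4],
    # ram_blocks = 12 - 7 = 5, so layers 0-1 map to "disk", layers 2-4 to
    # "cpu", layers 5-7 to GPU 0 and layers 8-11 to GPU 1.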
def get_device_map(self, model: nn.Module) -> dict:
ram_blocks = len(utils.layers_module_names) - sum(self.gpu_blocks)
cumulative_gpu_blocks = tuple(itertools.accumulate(self.gpu_blocks))
device_map = {}
for name in utils.layers_module_names:
layer = int(name.rsplit(".", 1)[1])
device = (
("disk" if layer < self.disk_blocks else "cpu")
if layer < ram_blocks
else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
)
device_map[name] = device
for name in utils.get_missing_module_names(model, list(device_map.keys())):
device_map[name] = self.primary_device
return device_map
class HFTorchInferenceModel(HFInferenceModel):
def __init__(self) -> None:
super().__init__()
@@ -79,6 +110,29 @@ class HFTorchInferenceModel(HFInferenceModel):
post_token_probs=True,
)
self._old_stopping_criteria = None
self.breakmodel_config = BreakmodelConfig()
def set_input_parameters(self, parameters):
ret = super().set_input_parameters(parameters)
# Hook onto input param setting for setting breakmodel stuff
if self.breakmodel:
self.breakmodel_config.gpu_blocks = self.layers
self.breakmodel_config.disk_blocks = self.disk_layers
return ret
def get_auxilary_device(self) -> Union[str, int, torch.device]:
return self.breakmodel_config.primary_device
    def _get_target_dtype(self) -> torch.dtype:
if self.breakmodel_config.primary_device == "cpu":
return torch.float32
elif utils.args.cpu:
return torch.float32
elif not self.usegpu and not self.breakmodel:
return torch.float32
return torch.float16
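    # In other words: float16 is only selected when at least part of the model
    # actually runs on a GPU (full GPU mode or breakmodel); CPU-only setups
    # stay in float32.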
def _apply_warpers(
self, scores: torch.Tensor, input_ids: torch.Tensor
@@ -125,19 +179,7 @@ class HFTorchInferenceModel(HFInferenceModel):
else:
return "Unknown"
-    def get_auxilary_device(self):
-        """Get device auxilary tensors like inputs should be stored on."""
-        # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU.
-        if utils.koboldai_vars.hascuda and self.usegpu:
-            return utils.koboldai_vars.gpu_device
-        elif utils.koboldai_vars.hascuda and self.breakmodel:
-            import breakmodel
-            return breakmodel.primary_device
-        return "cpu"
def _post_load(m_self) -> None:
if not utils.koboldai_vars.model_type:
utils.koboldai_vars.model_type = m_self.get_model_type()
@@ -220,6 +262,40 @@ class HFTorchInferenceModel(HFInferenceModel):
new_sample.old_sample = transformers.GenerationMixin.sample
use_core_manipulations.sample = new_sample
# PEFT Loading. This MUST be done after all save_pretrained calls are
# finished on the main model.
if utils.args.peft:
from peft import PeftModel, PeftConfig
local_peft_dir = os.path.join(m_self.get_local_model_path(), "peft")
# Make PEFT dir if it doesn't exist
try:
os.makedirs(local_peft_dir)
except FileExistsError:
pass
peft_local_path = os.path.join(local_peft_dir, utils.args.peft.replace("/", "_"))
logger.debug(f"Loading PEFT '{utils.args.peft}', possible local path is '{peft_local_path}'.")
peft_installed_locally = True
possible_peft_locations = [peft_local_path, utils.args.peft]
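            # Try the previously saved local copy first, then fall back to the
            # identifier given on the command line (e.g. a Hugging Face Hub
            # name); if neither loads, bail out below.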
for i, location in enumerate(possible_peft_locations):
try:
m_self.model = PeftModel.from_pretrained(m_self.model, location)
logger.debug(f"Loaded PEFT at '{location}'")
break
except ValueError:
peft_installed_locally = False
if i == len(possible_peft_locations) - 1:
raise RuntimeError(f"Unable to load PeftModel for given name '{utils.args.peft}'. Does it exist?")
except RuntimeError:
raise RuntimeError("Error while loading PeftModel. Are you using the correct model?")
if not peft_installed_locally:
logger.debug(f"PEFT not saved to models folder; saving to '{peft_local_path}'")
m_self.model.save_pretrained(peft_local_path)
return super()._post_load()
def _raw_generate(
@@ -236,9 +312,11 @@ class HFTorchInferenceModel(HFInferenceModel):
gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None]
else:
gen_in = prompt_tokens
-        device = self.get_auxilary_device()
-        gen_in = gen_in.to(device)
+        if not self.usegpu and not self.breakmodel:
+            gen_in = gen_in.to("cpu")
+        else:
+            device = self.get_auxilary_device()
+            gen_in = gen_in.to(device)
additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else []
@@ -254,8 +332,7 @@ class HFTorchInferenceModel(HFInferenceModel):
len(prompt_tokens) + max_new, utils.koboldai_vars.max_length
),
repetition_penalty=1.0,
-                bad_words_ids=self.badwordsids
-                + additional_bad_words_ids,
+                bad_words_ids=self.badwordsids + additional_bad_words_ids,
use_cache=True,
num_return_sequences=batch_count,
)
@@ -275,6 +352,9 @@ class HFTorchInferenceModel(HFInferenceModel):
tf_kwargs["revision"] = utils.koboldai_vars.revision
tf_kwargs["cache_dir"] = "cache"
if self.lazy_load:
tf_kwargs.pop("low_cpu_mem_usage", None)
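            # low_cpu_mem_usage is dropped here, presumably because the
            # lazy-load path below supplies its own device_map and handles
            # memory-efficient loading itself.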
# If we have model hints for legacy model, use them rather than fall back.
try:
if self.model_name == "GPT2Custom":
@@ -283,10 +363,63 @@ class HFTorchInferenceModel(HFInferenceModel):
return GPTNeoForCausalLM.from_pretrained(location, **tf_kwargs)
except Exception as e:
logger.warning(f"{self.model_name} is a no-go; {e} - Falling back to auto.")
if utils.args.panic:
raise
# Try to determine model type from either AutoModel or falling back to legacy
try:
-            return AutoModelForCausalLM.from_pretrained(location, **tf_kwargs)
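            # When lazy loading, a dematerialized "meta" copy of the model is
            # built first so a device map can be computed without allocating
            # any real weights.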
if self.lazy_load:
with lazy_loader.use_lazy_load(dematerialized_modules=True):
metamodel = AutoModelForCausalLM.from_config(self.model_config)
if utils.args.cpu:
cpu_map = {name: "cpu" for name in utils.layers_module_names}
for name in utils.get_missing_module_names(
metamodel, list(cpu_map.keys())
):
cpu_map[name] = "cpu"
tf_kwargs["device_map"] = cpu_map
else:
tf_kwargs["device_map"] = self.breakmodel_config.get_device_map(
metamodel
)
try:
# Try to load with the lazyloader first...
with lazy_loader.use_lazy_load(
enable=self.lazy_load,
# DO NOT DEMATERIALIZE MODULES / INIT WEIGHTS EMPTY!!! IT WILL EXPLODE!!!!!!!
dematerialized_modules=False,
):
model = AutoModelForCausalLM.from_pretrained(
location,
offload_folder="accelerate-disk-cache",
torch_dtype=self._get_target_dtype(),
**tf_kwargs,
)
except Exception as e:
# ...but fall back to stock HF if lazyloader fails.
if utils.args.panic:
raise
logger.error("Lazyloader failed, falling back to stock HF load. You may run out of RAM here. Details:")
logger.error(e)
logger.error(traceback.format_exc())
logger.info("Falling back to stock HF load...")
model = AutoModelForCausalLM.from_pretrained(
location,
offload_folder="accelerate-disk-cache",
torch_dtype=self._get_target_dtype(),
**tf_kwargs,
)
if not self.lazy_load and not self.breakmodel:
# We need to move the model to the desired device
if (not self.usegpu) or torch.cuda.device_count() <= 0:
model = model.to("cpu")
else:
model = model.to("cuda")
return model
except Exception as e:
traceback_string = traceback.format_exc().lower()
@@ -300,6 +433,9 @@ class HFTorchInferenceModel(HFInferenceModel):
logger.error("Invalid load key! Aborting.")
raise
if utils.args.panic:
raise
logger.warning(f"Fell back to GPT2LMHeadModel due to {e}")
logger.debug(traceback.format_exc())
@@ -325,49 +461,6 @@ class HFTorchInferenceModel(HFInferenceModel):
return True
def _move_to_devices(self) -> None:
for key, value in self.model.state_dict().items():
target_dtype = (
torch.float32 if breakmodel.primary_device == "cpu" else torch.float16
)
if value.dtype is not target_dtype:
accelerate.utils.set_module_tensor_to_device(
self.model,
tensor_name=key,
device=torch.device(value.device),
value=value,
dtype=target_dtype,
)
disk_blocks = breakmodel.disk_blocks
gpu_blocks = breakmodel.gpu_blocks
ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks)
cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
device_map = {}
for name in utils.layers_module_names:
layer = int(name.rsplit(".", 1)[1])
device = (
("disk" if layer < disk_blocks else "cpu")
if layer < ram_blocks
else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
)
device_map[name] = device
for name in utils.get_missing_module_names(self.model, list(device_map.keys())):
device_map[name] = breakmodel.primary_device
breakmodel.dispatch_model_ex(
self.model,
device_map,
main_device=breakmodel.primary_device,
offload_buffers=True,
offload_dir="accelerate-disk-cache",
)
gc.collect()
return
# Function to patch transformers to use our soft prompt
def patch_embedding(self) -> None:
if getattr(Embedding, "_koboldai_patch_causallm_model", None):
@@ -409,404 +502,20 @@ class HFTorchInferenceModel(HFInferenceModel):
Embedding.__call__ = new_embedding_call
Embedding._koboldai_patch_causallm_model = self.model
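    # The callback returned below is invoked by the lazy loader for each
    # checkpoint file; it decides, tensor by tensor, which device every weight
    # should live on and materializes it straight onto that device (or into the
    # accelerate disk cache).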
def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True):
if not self.lazy_load:
return
disk_blocks = breakmodel.disk_blocks
gpu_blocks = breakmodel.gpu_blocks
        ram_blocks = n_layers - sum(gpu_blocks)
cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
def lazy_load_callback(
model_dict: Dict[str, Union[lazy_loader.LazyTensor, torch.Tensor]],
f,
is_safetensors: bool = False,
**_,
):
if lazy_load_callback.nested:
return
lazy_load_callback.nested = True
device_map: Dict[str, Union[str, int]] = {}
@functools.lru_cache(maxsize=None)
def get_original_key(key) -> Optional[str]:
try:
key_candidates = [
original_key
for original_key in utils.module_names
if original_key.endswith(key)
]
except ValueError:
return key
if not key_candidates:
logger.debug(f"!!! No key candidates for {key}")
return None
return max(key_candidates, key=len)
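            # Non-layer tensors go to the GPU / primary device (or CPU when
            # there is no CUDA), while layer tensors are routed by their layer
            # index: disk first, then "shared" CPU RAM, then the GPUs in order.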
for key, value in model_dict.items():
original_key = get_original_key(key)
if not original_key:
continue
if isinstance(value, lazy_loader.LazyTensor) and not any(
original_key.startswith(n) for n in utils.layers_module_names
):
device_map[key] = (
utils.koboldai_vars.gpu_device
if utils.koboldai_vars.hascuda and self.usegpu
else "cpu"
if not utils.koboldai_vars.hascuda
or not self.breakmodel
else breakmodel.primary_device
)
else:
layer = int(
max(
(
n
for n in utils.layers_module_names
if original_key.startswith(n)
),
key=len,
).rsplit(".", 1)[1]
)
device = (
utils.koboldai_vars.gpu_device
if utils.koboldai_vars.hascuda and self.usegpu
else "disk"
if layer < disk_blocks and layer < ram_blocks
else "cpu"
if not utils.koboldai_vars.hascuda
or not self.breakmodel
else "shared"
if layer < ram_blocks
else bisect.bisect_right(
cumulative_gpu_blocks, layer - ram_blocks
)
)
device_map[key] = device
if utils.num_shards is None or utils.current_shard == 0:
utils.offload_index = {}
if os.path.isdir("accelerate-disk-cache"):
# Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder
# (the folder doesn't contain any subfolders so os.remove will do just fine)
for filename in os.listdir("accelerate-disk-cache"):
try:
os.remove(os.path.join("accelerate-disk-cache", filename))
except OSError:
pass
os.makedirs("accelerate-disk-cache", exist_ok=True)
if utils.num_shards is not None:
num_tensors = len(
utils.get_sharded_checkpoint_num_tensors(
utils.from_pretrained_model_name,
utils.from_pretrained_index_filename,
is_safetensors=is_safetensors,
**utils.from_pretrained_kwargs,
)
)
else:
num_tensors = len(device_map)
print(flush=True)
utils.koboldai_vars.status_message = "Loading model"
utils.koboldai_vars.total_layers = num_tensors
utils.koboldai_vars.loaded_layers = 0
utils.bar = tqdm(
total=num_tensors,
desc="Loading model tensors",
file=utils.UIProgressBarFile(),
position=1
)
if not is_safetensors:
# Torch lazyload
with zipfile.ZipFile(f, "r") as z:
try:
last_storage_key = None
zipfolder = os.path.basename(os.path.normpath(f)).split(".")[0]
f = None
current_offset = 0
able_to_pin_layers = True
if utils.num_shards is not None:
utils.current_shard += 1
for key in sorted(
device_map.keys(),
key=lambda k: (
model_dict[k].key,
model_dict[k].seek_offset,
),
):
storage_key = model_dict[key].key
if (
storage_key != last_storage_key
or model_dict[key].seek_offset < current_offset
):
last_storage_key = storage_key
if isinstance(f, zipfile.ZipExtFile):
f.close()
ziproot = z.namelist()[0].split("/")[0]
f = z.open(f"{ziproot}/data/{storage_key}")
current_offset = 0
if current_offset != model_dict[key].seek_offset:
f.read(model_dict[key].seek_offset - current_offset)
current_offset = model_dict[key].seek_offset
device = device_map[key]
size = functools.reduce(
lambda x, y: x * y, model_dict[key].shape, 1
)
dtype = model_dict[key].dtype
nbytes = (
size
if dtype is torch.bool
else size
* (
(
torch.finfo
if dtype.is_floating_point
else torch.iinfo
)(dtype).bits
>> 3
)
)
# print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
#logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ")
model_dict[key] = model_dict[key].materialize(
f, map_location="cpu"
)
if model_dict[key].dtype is torch.float32:
utils.koboldai_vars.fp32_model = True
if (
convert_to_float16
and breakmodel.primary_device != "cpu"
and utils.koboldai_vars.hascuda
and (
self.breakmodel
or self.usegpu
)
and model_dict[key].dtype is torch.float32
):
model_dict[key] = model_dict[key].to(torch.float16)
if breakmodel.primary_device == "cpu" or (
not self.usegpu
and not self.breakmodel
and model_dict[key].dtype is torch.float16
):
model_dict[key] = model_dict[key].to(torch.float32)
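                            # "shared" tensors stay in (ideally pinned) CPU RAM,
                            # "disk" tensors are offloaded through accelerate's
                            # offload index and replaced by meta tensors, and
                            # everything else moves straight to its mapped device.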
if device == "shared":
model_dict[key] = model_dict[key].to("cpu").detach_()
if able_to_pin_layers:
try:
model_dict[key] = model_dict[key].pin_memory()
except:
able_to_pin_layers = False
elif device == "disk":
accelerate.utils.offload_weight(
model_dict[key],
get_original_key(key),
"accelerate-disk-cache",
index=utils.offload_index,
)
model_dict[key] = model_dict[key].to("meta")
else:
model_dict[key] = model_dict[key].to(device)
# print("OK", flush=True)
current_offset += nbytes
utils.bar.update(1)
utils.koboldai_vars.loaded_layers += 1
finally:
if (
utils.num_shards is None
or utils.current_shard >= utils.num_shards
):
if utils.offload_index:
for name, tensor in utils.named_buffers:
dtype = tensor.dtype
if (
convert_to_float16
and breakmodel.primary_device != "cpu"
and utils.koboldai_vars.hascuda
and (
self.breakmodel
or self.usegpu
)
):
dtype = torch.float16
if breakmodel.primary_device == "cpu" or (
not self.usegpu
and not self.breakmodel
):
dtype = torch.float32
if (
name in model_dict
and model_dict[name].dtype is not dtype
):
model_dict[name] = model_dict[name].to(dtype)
if tensor.dtype is not dtype:
tensor = tensor.to(dtype)
if name not in utils.offload_index:
accelerate.utils.offload_weight(
tensor,
name,
"accelerate-disk-cache",
index=utils.offload_index,
)
accelerate.utils.save_offload_index(
utils.offload_index, "accelerate-disk-cache"
)
utils.bar.close()
utils.bar = None
utils.koboldai_vars.status_message = ""
lazy_load_callback.nested = False
if isinstance(f, zipfile.ZipExtFile):
f.close()
else:
# Loading with safetensors
try:
able_to_pin_layers = True
if utils.num_shards is not None:
utils.current_shard += 1
for key in sorted(
device_map.keys(),
key=lambda k: model_dict[k].key,
):
storage_key = model_dict[key].key
device = device_map[key]
# print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
model_dict[key] = model_dict[key].materialize(
f, map_location="cpu"
)
if model_dict[key].dtype is torch.float32:
utils.koboldai_vars.fp32_model = True
if (
convert_to_float16
and breakmodel.primary_device != "cpu"
and utils.koboldai_vars.hascuda
and (
self.breakmodel
or self.usegpu
)
and model_dict[key].dtype is torch.float32
):
model_dict[key] = model_dict[key].to(torch.float16)
if breakmodel.primary_device == "cpu" or (
not self.usegpu
and not self.breakmodel
and model_dict[key].dtype is torch.float16
):
model_dict[key] = model_dict[key].to(torch.float32)
if device == "shared":
model_dict[key] = model_dict[key].to("cpu").detach_()
if able_to_pin_layers:
try:
model_dict[key] = model_dict[key].pin_memory()
except:
able_to_pin_layers = False
elif device == "disk":
accelerate.utils.offload_weight(
model_dict[key],
get_original_key(key),
"accelerate-disk-cache",
index=utils.offload_index,
)
model_dict[key] = model_dict[key].to("meta")
else:
model_dict[key] = model_dict[key].to(device)
utils.bar.update(1)
utils.koboldai_vars.loaded_layers += 1
finally:
if (
utils.num_shards is None
or utils.current_shard >= utils.num_shards
):
if utils.offload_index:
for name, tensor in utils.named_buffers:
dtype = tensor.dtype
if (
convert_to_float16
and breakmodel.primary_device != "cpu"
and utils.koboldai_vars.hascuda
and (
self.breakmodel
or self.usegpu
)
):
dtype = torch.float16
if breakmodel.primary_device == "cpu" or (
not self.usegpu
and not self.breakmodel
):
dtype = torch.float32
if (
name in model_dict
and model_dict[name].dtype is not dtype
):
model_dict[name] = model_dict[name].to(dtype)
if tensor.dtype is not dtype:
tensor = tensor.to(dtype)
if name not in utils.offload_index:
accelerate.utils.offload_weight(
tensor,
name,
"accelerate-disk-cache",
index=utils.offload_index,
)
accelerate.utils.save_offload_index(
utils.offload_index, "accelerate-disk-cache"
)
utils.bar.close()
utils.bar = None
utils.koboldai_vars.status_message = ""
lazy_load_callback.nested = False
lazy_load_callback.nested = False
return lazy_load_callback
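    # Context manager that temporarily makes float16 the default torch dtype
    # while weights are being instantiated, when always_use is passed or when
    # low-memory mode is enabled and a GPU (full GPU or breakmodel) is in use.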
@contextlib.contextmanager
def _maybe_use_float16(self, always_use: bool = False):
if always_use or (
utils.koboldai_vars.hascuda
and self.low_mem
and (self.usegpu or self.breakmodel)
):
original_dtype = torch.get_default_dtype()
torch.set_default_dtype(torch.float16)
yield True
torch.set_default_dtype(original_dtype)
else:
yield False
def breakmodel_device_list(self, n_layers, primary=None, selected=None):
# TODO: Find a better place for this or rework this
device_count = torch.cuda.device_count()
if device_count < 2:
primary = None
logger.debug("n_layers: {}".format(n_layers))
logger.debug("gpu blocks: {}".format(breakmodel.gpu_blocks))
gpu_blocks = breakmodel.gpu_blocks + (
device_count - len(breakmodel.gpu_blocks)
logger.debug("gpu blocks: {}".format(self.breakmodel_config.gpu_blocks))
gpu_blocks = self.breakmodel_config.gpu_blocks + (
device_count - len(self.breakmodel_config.gpu_blocks)
) * [0]
print(f"{Colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{Colors.END}")
for i in range(device_count):
name = torch.cuda.get_device_name(i)
if len(name) > 47:
@@ -816,72 +525,83 @@ class HFTorchInferenceModel(HFInferenceModel):
print(
f"{row_color}{Colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{Colors.END}"
)
row_color = Colors.END
sep_color = Colors.YELLOW
print(
f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}"
f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {self.breakmodel_config.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}"
)
print(
f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){Colors.END}"
)
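    # breakmodel_device_config normalizes the user's layer split: a -1 entry
    # absorbs all remaining layers, --cpu zeroes every GPU, the result is
    # written to a per-model settings/<name>.breakmodel file, and single-device
    # splits are collapsed back into plain GPU (or CPU) generation.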
def breakmodel_device_config(self, config):
# TODO: Find a better place for this or rework this
global breakmodel, generator
import breakmodel
n_layers = utils.num_layers(config)
logger.debug("gpu blocks before modification: {}".format(breakmodel.gpu_blocks))
logger.debug(
"gpu blocks before modification: {}".format(
self.breakmodel_config.gpu_blocks
)
)
if utils.args.cpu:
-            breakmodel.gpu_blocks = [0] * n_layers
+            self.breakmodel_config.gpu_blocks = [0] * n_layers
return
-        elif breakmodel.gpu_blocks == []:
+        elif self.breakmodel_config.gpu_blocks == []:
logger.info("Breakmodel not specified, assuming GPU 0")
-            breakmodel.gpu_blocks = [n_layers]
+            self.breakmodel_config.gpu_blocks = [n_layers]
n_layers = 0
else:
s = n_layers
-            for i in range(len(breakmodel.gpu_blocks)):
-                if breakmodel.gpu_blocks[i] <= -1:
-                    breakmodel.gpu_blocks[i] = s
+            for i in range(len(self.breakmodel_config.gpu_blocks)):
+                if self.breakmodel_config.gpu_blocks[i] <= -1:
+                    self.breakmodel_config.gpu_blocks[i] = s
break
else:
-                    s -= breakmodel.gpu_blocks[i]
-            assert sum(breakmodel.gpu_blocks) <= n_layers
-            n_layers -= sum(breakmodel.gpu_blocks)
-            if breakmodel.disk_blocks is not None:
-                assert breakmodel.disk_blocks <= n_layers
-                n_layers -= breakmodel.disk_blocks
+                    s -= self.breakmodel_config.gpu_blocks[i]
+            assert sum(self.breakmodel_config.gpu_blocks) <= n_layers
+            n_layers -= sum(self.breakmodel_config.gpu_blocks)
+            if self.breakmodel_config.disk_blocks is not None:
+                assert self.breakmodel_config.disk_blocks <= n_layers
+                n_layers -= self.breakmodel_config.disk_blocks
logger.init_ok("Final device configuration:", status="Info")
-        self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device)
-        with open("settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w") as file:
-            file.write("{}\n{}".format(",".join(map(str, breakmodel.gpu_blocks)), breakmodel.disk_blocks))
+        self.breakmodel_device_list(
+            n_layers, primary=self.breakmodel_config.primary_device
+        )
+        with open(
+            "settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w"
+        ) as file:
+            file.write(
+                "{}\n{}".format(
+                    ",".join(map(str, self.breakmodel_config.gpu_blocks)),
+                    self.breakmodel_config.disk_blocks,
+                )
+            )
# If all layers are on the same device, use the old GPU generation mode
-        while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0:
-            breakmodel.gpu_blocks.pop()
+        while (
+            len(self.breakmodel_config.gpu_blocks)
+            and self.breakmodel_config.gpu_blocks[-1] == 0
+        ):
+            self.breakmodel_config.gpu_blocks.pop()
self.breakmodel = True
-        if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in (
+        if len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[
+            -1
+        ] in (
-1,
utils.num_layers(config),
):
logger.debug("All layers on same GPU. Breakmodel disabled")
self.breakmodel = False
self.usegpu = True
-            utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks) - 1
+            utils.koboldai_vars.gpu_device = len(self.breakmodel_config.gpu_blocks) - 1
return
-        if not breakmodel.gpu_blocks:
+        if not self.breakmodel_config.gpu_blocks:
logger.warning("Nothing assigned to a GPU, reverting to CPU only mode")
-            import breakmodel
-            breakmodel.primary_device = "cpu"
self.breakmodel = False
self.usegpu = False
return