prompt_tuner.py now uses lazy loader and accelerate

vfbd 2022-08-22 19:29:20 -04:00
parent 584056b6d5
commit 3d5c83fc23
3 changed files with 424 additions and 27 deletions

View File

@@ -429,6 +429,7 @@ def emit(*args, **kwargs):
return _emit(*args, **kwargs)
except AttributeError:
return socketio.emit(*args, **kwargs)
utils.emit = emit
# marshmallow/apispec setup
from apispec import APISpec
@@ -1311,6 +1312,8 @@ def general_startup(override_args=None):
args = parser.parse_args(shlex.split(os.environ["KOBOLDAI_ARGS"]))
else:
args = parser.parse_args()
utils.args = args
if args.customsettings:
f = open (args.customsettings)
@@ -1648,7 +1651,9 @@ def patch_transformers():
if not args.no_aria2:
utils.aria2_hook(pretrained_model_name_or_path, **kwargs)
return old_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
PreTrainedModel.from_pretrained = new_from_pretrained
if(not hasattr(PreTrainedModel, "_kai_patched")):
PreTrainedModel.from_pretrained = new_from_pretrained
PreTrainedModel._kai_patched = True
if(hasattr(modeling_utils, "get_checkpoint_shard_files")):
old_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files
def new_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs):
@@ -2490,7 +2495,9 @@ def load_model(use_gpu=True, gpu_layers=None, disk_layers=None, initial_load=Fal
if not args.no_aria2:
utils.aria2_hook(pretrained_model_name_or_path, **kwargs)
return old_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
PreTrainedModel.from_pretrained = new_from_pretrained
if(not hasattr(PreTrainedModel, "_kai_patched")):
PreTrainedModel.from_pretrained = new_from_pretrained
PreTrainedModel._kai_patched = True
if(hasattr(modeling_utils, "get_checkpoint_shard_files")):
old_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files
def new_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs):
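Both hunks above wrap the from_pretrained monkey-patch in a _kai_patched guard so that patch_transformers() and load_model() can run in the same process without wrapping the already-wrapped method a second time (which would make the aria2 download hook fire twice per load). A minimal sketch of the guard pattern, using a stand-in DummyModel class instead of transformers' PreTrainedModel:

class DummyModel:
    @classmethod
    def from_pretrained(cls, name, *args, **kwargs):
        return f"loaded {name}"

def patch_once():
    # Grab the plain function behind the original classmethod.
    old_from_pretrained = DummyModel.from_pretrained.__func__

    @classmethod
    def new_from_pretrained(cls, name, *args, **kwargs):
        print(f"download hook for {name}")  # stand-in for utils.aria2_hook
        return old_from_pretrained(cls, name, *args, **kwargs)

    # Without the guard, a second call would capture the first wrapper as
    # old_from_pretrained and the hook would run twice for every load.
    if not hasattr(DummyModel, "_kai_patched"):
        DummyModel.from_pretrained = new_from_pretrained
        DummyModel._kai_patched = True

patch_once()
patch_once()  # no-op thanks to the guard
print(DummyModel.from_pretrained("gpt2"))  # hook prints once, then "loaded gpt2"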

View File

@@ -15,15 +15,249 @@ import base64
import pickle
import hashlib
import itertools
import functools
import bisect
import eventlet
import packaging
import gc
import time
from tqdm.auto import tqdm
import torch
import torch.nn.functional as F
from torch.nn import Embedding, CrossEntropyLoss
import transformers
from transformers import AutoTokenizer, GPT2TokenizerFast, AutoConfig
from transformers import __version__ as transformers_version
from transformers import AutoTokenizer, GPT2TokenizerFast, AutoConfig, AutoModelForCausalLM, GPTNeoForCausalLM, PreTrainedModel, modeling_utils, GPTNeoModel, GPTJModel
import accelerate
import accelerate.utils
from mkultra.tuning import GPTPromptTuningMixin, GPTNeoPromptTuningLM
from mkultra.soft_prompt import SoftPrompt
from typing import List, Optional, TextIO, Union
from typing import Dict, List, Optional, TextIO, Union
try:
from transformers import XGLMModel
except:
pass
try:
from transformers.models.opt.modeling_opt import OPTDecoder
except:
pass
import breakmodel
import torch_lazy_loader
import utils
USE_BREAKMODEL = True
class Send_to_socketio(object):
def write(self, bar):
print(bar, end="")
time.sleep(0.01)
try:
if utils.emit is not None:
utils.emit('from_server', {'cmd': 'model_load_status', 'data': bar.replace(" ", "&nbsp;")}, broadcast=True)
except:
pass
def patch_transformers_download():
global transformers
import copy, requests, tqdm, time
class Send_to_socketio(object):
def write(self, bar):
bar = bar.replace("\r", "").replace("\n", "")
if bar != "":
try:
print(bar, end="\r")
if utils.emit is not None:
utils.emit('from_server', {'cmd': 'model_load_status', 'data': bar.replace(" ", "&nbsp;")}, broadcast=True)
eventlet.sleep(seconds=0)
except:
pass
def http_get(
url: str,
temp_file: transformers.utils.hub.BinaryIO,
proxies=None,
resume_size=0,
headers: transformers.utils.hub.Optional[transformers.utils.hub.Dict[str, str]] = None,
file_name: transformers.utils.hub.Optional[str] = None,
):
"""
Download remote file. Do not gobble up errors.
"""
headers = copy.deepcopy(headers)
if resume_size > 0:
headers["Range"] = f"bytes={resume_size}-"
r = requests.get(url, stream=True, proxies=proxies, headers=headers)
transformers.utils.hub._raise_for_status(r)
content_length = r.headers.get("Content-Length")
total = resume_size + int(content_length) if content_length is not None else None
# `tqdm` behavior is determined by `utils.logging.is_progress_bar_enabled()`
# and can be set using `utils.logging.enable/disable_progress_bar()`
if url[-11:] != 'config.json':
progress = tqdm.tqdm(
unit="B",
unit_scale=True,
unit_divisor=1024,
total=total,
initial=resume_size,
desc=f"Downloading {file_name}" if file_name is not None else "Downloading",
file=Send_to_socketio(),
)
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
if url[-11:] != 'config.json':
progress.update(len(chunk))
temp_file.write(chunk)
if url[-11:] != 'config.json':
progress.close()
transformers.utils.hub.http_get = http_get
def patch_transformers():
global transformers
patch_transformers_download()
old_from_pretrained = PreTrainedModel.from_pretrained.__func__
@classmethod
def new_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
utils.num_shards = None
utils.current_shard = 0
utils.from_pretrained_model_name = pretrained_model_name_or_path
utils.from_pretrained_index_filename = None
utils.from_pretrained_kwargs = kwargs
utils.bar = None
if utils.args is None or not utils.args.no_aria2:
utils.aria2_hook(pretrained_model_name_or_path, **kwargs)
return old_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
if(not hasattr(PreTrainedModel, "_kai_patched")):
PreTrainedModel.from_pretrained = new_from_pretrained
PreTrainedModel._kai_patched = True
if(hasattr(modeling_utils, "get_checkpoint_shard_files")):
old_get_checkpoint_shard_files = modeling_utils.get_checkpoint_shard_files
def new_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs):
utils.num_shards = utils.get_num_shards(index_filename)
utils.from_pretrained_index_filename = index_filename
return old_get_checkpoint_shard_files(pretrained_model_name_or_path, index_filename, *args, **kwargs)
modeling_utils.get_checkpoint_shard_files = new_get_checkpoint_shard_files
# Some versions of transformers 4.17.0.dev0 are affected by
# https://github.com/huggingface/transformers/issues/15736
# This is a workaround for those versions of transformers.
if(transformers_version == "4.17.0.dev0"):
try:
from transformers.models.xglm.modeling_xglm import XGLMSinusoidalPositionalEmbedding
except ImportError:
pass
else:
@torch.no_grad()
def new_forward(self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0):
bsz, seq_len = inputs_embeds.size()[:-1]
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
past_key_values_length + self.padding_idx + 1, past_key_values_length + sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
).unsqueeze(0).expand(input_shape).contiguous()
max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
XGLMSinusoidalPositionalEmbedding.forward = new_forward
# Fix a bug in OPTForCausalLM where self.lm_head is the wrong size
if(packaging.version.parse("4.19.0.dev0") <= packaging.version.parse(transformers_version) < packaging.version.parse("4.20.0")):
try:
from transformers import OPTForCausalLM, OPTModel
except ImportError:
pass
else:
# This is the same as the original __init__ but with
# config.hidden_size
# replaced with
# config.word_embed_proj_dim
def new_init(self, config):
super(OPTForCausalLM, self).__init__(config)
self.model = OPTModel(config)
self.lm_head = torch.nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False)
self.post_init()
OPTForCausalLM.__init__ = new_init
def move_model_to_devices(model, usegpu, gpu_device):
global generator
if(not utils.HAS_ACCELERATE and not USE_BREAKMODEL):
if(usegpu):
model = model.half().to(gpu_device)
else:
model = model.to('cpu').float()
generator = model.generate
return
import breakmodel
if(utils.HAS_ACCELERATE):
import accelerate.utils
for key, value in model.state_dict().items():
target_dtype = torch.float32 if breakmodel.primary_device == "cpu" else torch.float16
if(value.dtype is not target_dtype):
accelerate.utils.set_module_tensor_to_device(model, key, target_dtype)
disk_blocks = breakmodel.disk_blocks
gpu_blocks = breakmodel.gpu_blocks
ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks)
cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
device_map = {}
for name in utils.layers_module_names:
layer = int(name.rsplit(".", 1)[1])
device = ("disk" if layer < disk_blocks else "cpu") if layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
device_map[name] = device
for name in utils.get_missing_module_names(model, list(device_map.keys())):
device_map[name] = breakmodel.primary_device
breakmodel.dispatch_model_ex(model, device_map, main_device=breakmodel.primary_device, offload_buffers=True, offload_dir="accelerate-disk-cache")
gc.collect()
generator = model.generate
return
model.half()
gc.collect()
if(hasattr(model, "transformer")):
model.transformer.wte.to(breakmodel.primary_device)
model.transformer.ln_f.to(breakmodel.primary_device)
if(hasattr(model, 'lm_head')):
model.lm_head.to(breakmodel.primary_device)
if(hasattr(model.transformer, 'wpe')):
model.transformer.wpe.to(breakmodel.primary_device)
elif(not hasattr(model.model, "decoder")):
model.model.embed_tokens.to(breakmodel.primary_device)
model.model.layer_norm.to(breakmodel.primary_device)
model.lm_head.to(breakmodel.primary_device)
model.model.embed_positions.to(breakmodel.primary_device)
else:
model.model.decoder.embed_tokens.to(breakmodel.primary_device)
if(model.model.decoder.project_in is not None):
model.model.decoder.project_in.to(breakmodel.primary_device)
if(model.model.decoder.project_out is not None):
model.model.decoder.project_out.to(breakmodel.primary_device)
model.model.decoder.embed_positions.to(breakmodel.primary_device)
gc.collect()
GPTNeoModel.forward = breakmodel.new_forward_neo
if("GPTJModel" in globals()):
GPTJModel.forward = breakmodel.new_forward_neo # type: ignore
if("XGLMModel" in globals()):
XGLMModel.forward = breakmodel.new_forward_xglm # type: ignore
if("OPTDecoder" in globals()):
OPTDecoder.forward = breakmodel.new_forward_opt # type: ignore
generator = model.generate
if(hasattr(model, "transformer")):
breakmodel.move_hidden_layers(model.transformer)
elif(not hasattr(model.model, "decoder")):
breakmodel.move_hidden_layers(model.model, model.model.layers)
else:
breakmodel.move_hidden_layers(model.model.decoder, model.model.decoder.layers)
_PromptTuningPreTrainedModel = Union["UniversalPromptTuningMixin", GPTPromptTuningMixin, transformers.PreTrainedModel]
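The accelerate branch of move_model_to_devices above derives its device_map from nothing but layer indices: layers that do not fit on any GPU stay in RAM (the first disk_blocks of those are offloaded to disk), and the remaining layers are mapped to GPU ordinals by bisecting the running totals of gpu_blocks. A self-contained sketch of that arithmetic, with made-up layer counts:

import bisect
import itertools

def layer_device(layer, n_layers, disk_blocks, gpu_blocks):
    # Layers not assigned to any GPU live in RAM (CPU or disk offload).
    ram_blocks = n_layers - sum(gpu_blocks)
    cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
    if layer < ram_blocks:
        # The first disk_blocks of the RAM layers go to the disk cache.
        return "disk" if layer < disk_blocks else "cpu"
    # bisect_right finds the first GPU whose cumulative block count covers
    # the layer's position among the GPU-resident layers.
    return bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)

# Hypothetical split: 28 layers, 2 on disk, 4 on the CPU, 10 on GPU 0, 12 on GPU 1.
for layer in range(28):
    print(layer, layer_device(layer, 28, disk_blocks=2, gpu_blocks=[10, 12]))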
@@ -259,16 +493,20 @@ class TrainerBase(abc.ABC):
if "quiet" not in kwargs:
kwargs["quiet"] = self.quiet
raise ConfigurationError(msg, **kwargs)
def get_hf_checkpoint_metadata(self) -> bool:
def _get_model_config(self) -> transformers.configuration_utils.PretrainedConfig:
REVISION = None
params = {}
if(os.path.isdir(self.data.ckpt_path)):
model_config = AutoConfig.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
elif(os.path.isdir("models/{}".format(self.data.ckpt_path.replace('/', '_')))):
model_config = AutoConfig.from_pretrained("models/{}".format(self.data.ckpt_path.replace('/', '_')), revision=REVISION, cache_dir="cache")
else:
model_config = AutoConfig.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
return model_config
def get_hf_checkpoint_metadata(self) -> bool:
params = {}
model_config = self._get_model_config()
params["tokenizer_id"] = self.data.ckpt_path
tokenizer = get_tokenizer(self.data.ckpt_path)
params["newlinemode"] = params.get(
@@ -467,7 +705,17 @@ class TrainerBase(abc.ABC):
if isinstance(output_file, str):
f.close()
def train(self):
def train(
self,
breakmodel_primary_device: Optional[Union[str, int, torch.device]] = None,
breakmodel_gpulayers: Optional[List[int]] = None,
breakmodel_disklayers = 0,
):
if breakmodel_gpulayers is None:
breakmodel_gpulayers = []
if breakmodel_primary_device is None:
breakmodel_primary_device = 0 if breakmodel_gpulayers else "cpu"
if self.data.params is not None and "max_batch_size" not in self.data.params:
self.data.params["max_batch_size"] = 2048
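The new train() signature above takes the breakmodel layout as explicit arguments and fills in its own defaults. Pulled out into a standalone helper (the helper name is illustrative, not from the commit), the defaulting works like this:

from typing import List, Optional, Union
import torch

def resolve_breakmodel_args(
    breakmodel_primary_device: Optional[Union[str, int, torch.device]] = None,
    breakmodel_gpulayers: Optional[List[int]] = None,
    breakmodel_disklayers: int = 0,
):
    # Mirrors the defaulting at the top of train(): with no GPU layers the
    # primary device falls back to the CPU.
    if breakmodel_gpulayers is None:
        breakmodel_gpulayers = []
    if breakmodel_primary_device is None:
        breakmodel_primary_device = 0 if breakmodel_gpulayers else "cpu"
    return breakmodel_primary_device, breakmodel_gpulayers, breakmodel_disklayers

print(resolve_breakmodel_args())                              # ('cpu', [], 0)
print(resolve_breakmodel_args(breakmodel_gpulayers=[20, 8]))  # (0, [20, 8], 0)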
@@ -498,30 +746,169 @@ class TrainerBase(abc.ABC):
REVISION = None
tokenizer = self.get_tokenizer()
patch_transformers()
model: _PromptTuningPreTrainedModel
if(os.path.isdir(self.data.ckpt_path)):
model_config = self._get_model_config()
n_layers = utils.num_layers(model_config)
convert_to_float16 = True
hascuda = torch.cuda.is_available()
usegpu = not breakmodel_disklayers and len(breakmodel_gpulayers) == 1 and breakmodel_gpulayers[0] == n_layers
gpu_device = breakmodel_primary_device
breakmodel.disk_blocks = breakmodel_disklayers
disk_blocks = breakmodel.disk_blocks
gpu_blocks = breakmodel.gpu_blocks
ram_blocks = ram_blocks = n_layers - sum(gpu_blocks)
cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
def lazy_load_callback(model_dict: Dict[str, Union[torch_lazy_loader.LazyTensor, torch.Tensor]], f, **_):
if lazy_load_callback.nested:
return
lazy_load_callback.nested = True
device_map: Dict[str, Union[str, int]] = {}
@functools.lru_cache(maxsize=None)
def get_original_key(key):
return max((original_key for original_key in utils.module_names if original_key.endswith(key)), key=len)
for key, value in model_dict.items():
original_key = get_original_key(key)
if isinstance(value, torch_lazy_loader.LazyTensor) and not any(original_key.startswith(n) for n in utils.layers_module_names):
device_map[key] = gpu_device if hascuda and usegpu else "cpu" if not hascuda or not USE_BREAKMODEL else breakmodel.primary_device
else:
layer = int(max((n for n in utils.layers_module_names if original_key.startswith(n)), key=len).rsplit(".", 1)[1])
device = gpu_device if hascuda and usegpu else "disk" if layer < disk_blocks and layer < ram_blocks else "cpu" if not hascuda or not USE_BREAKMODEL else "shared" if layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)
device_map[key] = device
if utils.num_shards is None or utils.current_shard == 0:
utils.offload_index = {}
if utils.HAS_ACCELERATE:
if os.path.isdir("accelerate-disk-cache"):
# Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder
# (the folder doesn't contain any subfolders so os.remove will do just fine)
for filename in os.listdir("accelerate-disk-cache"):
try:
os.remove(os.path.join("accelerate-disk-cache", filename))
except OSError:
pass
os.makedirs("accelerate-disk-cache", exist_ok=True)
if utils.num_shards is not None:
num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs))
else:
num_tensors = len(device_map)
print(flush=True)
utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=Send_to_socketio())
with zipfile.ZipFile(f, "r") as z:
try:
last_storage_key = None
f = None
current_offset = 0
able_to_pin_layers = True
if utils.num_shards is not None:
utils.current_shard += 1
for key in sorted(device_map.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)):
storage_key = model_dict[key].key
if storage_key != last_storage_key or model_dict[key].seek_offset < current_offset:
last_storage_key = storage_key
if isinstance(f, zipfile.ZipExtFile):
f.close()
f = z.open(f"archive/data/{storage_key}")
current_offset = 0
if current_offset != model_dict[key].seek_offset:
f.read(model_dict[key].seek_offset - current_offset)
current_offset = model_dict[key].seek_offset
device = device_map[key]
size = functools.reduce(lambda x, y: x * y, model_dict[key].shape, 1)
dtype = model_dict[key].dtype
nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3)
#print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True)
model_dict[key] = model_dict[key].materialize(f, map_location="cpu")
# if model_dict[key].dtype is torch.float32:
# fp32_model = True
if convert_to_float16 and breakmodel.primary_device != "cpu" and hascuda and (USE_BREAKMODEL or usegpu) and model_dict[key].dtype is torch.float32:
model_dict[key] = model_dict[key].to(torch.float16)
if breakmodel.primary_device == "cpu" or (not usegpu and not USE_BREAKMODEL and model_dict[key].dtype is torch.float16):
model_dict[key] = model_dict[key].to(torch.float32)
if device == "shared":
model_dict[key] = model_dict[key].to("cpu").detach_()
if able_to_pin_layers and utils.HAS_ACCELERATE:
try:
model_dict[key] = model_dict[key].pin_memory()
except:
able_to_pin_layers = False
elif device == "disk":
accelerate.utils.offload_weight(model_dict[key], get_original_key(key), "accelerate-disk-cache", index=utils.offload_index)
model_dict[key] = model_dict[key].to("meta")
else:
model_dict[key] = model_dict[key].to(device)
#print("OK", flush=True)
current_offset += nbytes
utils.bar.update(1)
finally:
if utils.num_shards is None or utils.current_shard >= utils.num_shards:
if utils.offload_index:
for name, tensor in utils.named_buffers:
if name not in utils.offload_index:
accelerate.utils.offload_weight(tensor, name, "accelerate-disk-cache", index=utils.offload_index)
accelerate.utils.save_offload_index(utils.offload_index, "accelerate-disk-cache")
utils.bar.close()
utils.bar = None
lazy_load_callback.nested = False
if isinstance(f, zipfile.ZipExtFile):
f.close()
lazy_load_callback.nested = False
# Since we're using lazy loader, we need to figure out what the model's hidden layers are called
with torch_lazy_loader.use_lazy_torch_load(dematerialized_modules=True, use_accelerate_init_empty_weights=True):
try:
model = AutoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
metamodel = AutoModelForCausalLM.from_config(model_config)
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
model = GPTNeoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
elif(os.path.isdir("models/{}".format(self.data.ckpt_path.replace('/', '_')))):
try:
model = AutoPromptTuningLM.from_pretrained("models/{}".format(self.data.ckpt_path.replace('/', '_')), revision=REVISION, cache_dir="cache")
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
model = GPTNeoPromptTuningLM.from_pretrained("models/{}".format(self.data.ckpt_path.replace('/', '_')), revision=REVISION, cache_dir="cache")
metamodel = GPTNeoForCausalLM.from_config(model_config)
utils.layers_module_names = utils.get_layers_module_names(metamodel)
utils.module_names = list(metamodel.state_dict().keys())
utils.named_buffers = list(metamodel.named_buffers(recurse=True))
with torch_lazy_loader.use_lazy_torch_load(callback=lazy_load_callback, dematerialized_modules=True):
if(os.path.isdir(self.data.ckpt_path)):
try:
model = AutoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
model = GPTNeoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
elif(os.path.isdir("models/{}".format(self.data.ckpt_path.replace('/', '_')))):
try:
model = AutoPromptTuningLM.from_pretrained("models/{}".format(self.data.ckpt_path.replace('/', '_')), revision=REVISION, cache_dir="cache")
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
model = GPTNeoPromptTuningLM.from_pretrained("models/{}".format(self.data.ckpt_path.replace('/', '_')), revision=REVISION, cache_dir="cache")
else:
try:
model = AutoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
model = GPTNeoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
if(hascuda):
if(usegpu):
model = model.half().to(gpu_device)
elif(breakmodel): # Use both RAM and VRAM (breakmodel)
move_model_to_devices(model, usegpu, gpu_device)
elif(__import__("breakmodel").disk_blocks > 0):
move_model_to_devices(model, usegpu, gpu_device)
else:
model = model.to('cpu').float()
elif(__import__("breakmodel").disk_blocks > 0):
move_model_to_devices(model, usegpu, gpu_device)
else:
try:
model = AutoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
except Exception as e:
if("out of memory" in traceback.format_exc().lower()):
raise RuntimeError("One of your GPUs ran out of memory when KoboldAI tried to load your model.")
model = GPTNeoPromptTuningLM.from_pretrained(self.data.ckpt_path, revision=REVISION, cache_dir="cache")
model.to('cpu').float()
if step == 0:
soft_embeddings = self.get_initial_soft_embeddings(model)
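The lazy-loading block above works in two passes: it first instantiates the model with empty (meta) weights purely to learn the module layout, records utils.layers_module_names, utils.module_names and utils.named_buffers from that metamodel, and only then performs the real load with lazy_load_callback routing every tensor to its target device. A stripped-down sketch of the same two-pass idea, assuming accelerate is installed; the model name is only an example, and the regex matches GPT-Neo's layer naming where the commit instead uses utils.get_layers_module_names to cover several architectures:

import re
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

MODEL = "EleutherAI/gpt-neo-125M"  # example; the trainer uses self.data.ckpt_path
config = AutoConfig.from_pretrained(MODEL, cache_dir="cache")

# Pass 1: a weightless "meta" model, built only to enumerate modules and layers.
with init_empty_weights():
    metamodel = AutoModelForCausalLM.from_config(config)
module_names = list(metamodel.state_dict().keys())
layer_names = [n for n, _ in metamodel.named_modules()
               if re.fullmatch(r"transformer\.h\.\d+", n)]
print(f"{len(module_names)} tensors, {len(layer_names)} hidden layers")

# Pass 2: the real load.  In the commit this runs inside
# torch_lazy_loader.use_lazy_torch_load(callback=lazy_load_callback, ...), which
# materializes each tensor straight onto the device chosen for its module.
model = AutoModelForCausalLM.from_pretrained(MODEL, cache_dir="cache")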

View File

@@ -22,6 +22,7 @@ except ImportError:
HAS_ACCELERATE = False
vars = None
args = None
num_shards: Optional[int] = None
current_shard = 0
from_pretrained_model_name = ""
@@ -35,6 +36,8 @@ named_buffers: Optional[List[tuple]] = None
default_sampler_order = [0, 1, 2, 3, 4, 5]
emit = None
#==================================================================#
# Decorator to prevent a function's actions from being run until
# at least x seconds have passed without the function being called
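The hunk ends just before the decorator that this comment describes. A minimal sketch of such a debounce decorator, built on threading.Timer; this is an illustration of the idea, not the implementation that follows in utils.py:

import functools
import threading

def debounce(wait):
    """Delay a function until `wait` seconds pass without it being called again."""
    def decorator(fn):
        timer = None
        lock = threading.Lock()

        @functools.wraps(fn)
        def debounced(*args, **kwargs):
            nonlocal timer
            with lock:
                if timer is not None:
                    timer.cancel()  # every new call restarts the countdown
                timer = threading.Timer(wait, fn, args, kwargs)
                timer.start()
        return debounced
    return decorator

@debounce(2)
def save_settings():
    print("settings written to disk")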