From 1546b9efaa95d7388b853d25ff3e8b756526f44e Mon Sep 17 00:00:00 2001 From: somebody Date: Sat, 27 May 2023 16:31:53 -0500 Subject: [PATCH 01/37] Hello its breaking breakmodel time --- aiserver.py | 3 + breakmodel.py | 955 ------------------ .../generic_hf_torch/class.py | 37 +- modeling/inference_models/hf.py | 6 +- modeling/inference_models/hf_torch.py | 148 +-- modeling/lazy_loader.py | 10 +- modeling/patches.py | 168 +++ utils.py | 6 +- 8 files changed, 236 insertions(+), 1097 deletions(-) delete mode 100644 breakmodel.py diff --git a/aiserver.py b/aiserver.py index faffe0e2..ad1efdab 100644 --- a/aiserver.py +++ b/aiserver.py @@ -30,6 +30,9 @@ logging.getLogger("urllib3").setLevel(logging.ERROR) import attention_bias attention_bias.do_patches() +from modeling import patches +patches.patch_transformers_for_lazyload() + from os import path, getcwd import time import re diff --git a/breakmodel.py b/breakmodel.py deleted file mode 100644 index 75bc03cc..00000000 --- a/breakmodel.py +++ /dev/null @@ -1,955 +0,0 @@ -''' -This is a MODIFIED version of arrmansa's low VRAM patch. -https://github.com/arrmansa/Basic-UI-for-GPT-J-6B-with-low-vram/blob/main/GPT-J-6B-Low-Vram-UI.ipynb -The ORIGINAL version of the patch is released under the Apache License 2.0 -Copyright 2021 arrmansa -Copyright 2021 finetuneanon -Copyright 2018, 2022 The Hugging Face team - - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-''' - - -import torch -from torch import nn -import torch.cuda.comm -import copy -import gc -import os -import sys -import itertools -import bisect -import random -import utils -from typing import Dict, List, Optional, Union - -from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPastAndCrossAttentions - -from transformers.utils import logging -logger = logging.get_logger(__name__) - - -breakmodel = True -gpu_blocks = [] -disk_blocks = 0 -primary_device = 0 if torch.cuda.device_count() > 0 else "cpu" - -from accelerate.hooks import attach_align_device_hook_on_blocks -from accelerate.utils import OffloadedWeightsLoader, check_device_map, extract_submodules_state_dict, offload_state_dict -from accelerate import dispatch_model - -def dispatch_model_ex( - model: nn.Module, - device_map: Dict[str, Union[str, int, torch.device]], - main_device: Optional[torch.device] = None, - state_dict: Optional[Dict[str, torch.Tensor]] = None, - offload_dir: Union[str, os.PathLike] = None, - offload_buffers: bool = False, - **kwargs, -): - """ - This is a modified version of - https://github.com/huggingface/accelerate/blob/eeaba598f455fbd2c48661d7e816d3ff25ab050b/src/accelerate/big_modeling.py#L130 - that still works when the main device is the CPU. - - Dispatches a model according to a given device map. Layers of the model might be spread across GPUs, offloaded on - the CPU or even the disk. - - Args: - model (`torch.nn.Module`): - The model to dispatch. - device_map (`Dict[str, Union[str, int, torch.device]]`): - A dictionary mapping module names in the models `state_dict` to the device they should go to. Note that - `"disk"` is accepted even if it's not a proper value for `torch.device`. - main_device (`str`, `int` or `torch.device`, *optional*): - The main execution device. Will default to the first device in the `device_map` different from `"cpu"` or - `"disk"`. - state_dict (`Dict[str, torch.Tensor]`, *optional*): - The state dict of the part of the model that will be kept on CPU. - offload_dir (`str` or `os.PathLike`): - The folder in which to offload the model weights (or where the model weights are already offloaded). - offload_buffers (`bool`, *optional*, defaults to `False`): - Whether or not to offload the buffers with the model parameters. - preload_module_classes (`List[str]`, *optional*): - A list of classes whose instances should load all their weights (even in the submodules) at the beginning - of the forward. This should only be used for classes that have submodules which are registered but not - called directly during the forward, for instance if a `dense` linear layer is registered, but at forward, - `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly. - """ - if main_device != "cpu": - return dispatch_model(model, device_map, main_device, state_dict, offload_dir=offload_dir, offload_buffers=offload_buffers, **kwargs) - - # Error early if the device map is incomplete. 
- check_device_map(model, device_map) - - offload_devices = ["cpu", "disk"] if main_device != "cpu" else ["disk"] - - if main_device is None: - main_device = [d for d in device_map.values() if d not in offload_devices][0] - - cpu_modules = [name for name, device in device_map.items() if device == "cpu"] if main_device != "cpu" else [] - if state_dict is None and len(cpu_modules) > 0: - state_dict = extract_submodules_state_dict(model.state_dict(), cpu_modules) - - disk_modules = [name for name, device in device_map.items() if device == "disk"] - if offload_dir is None and len(disk_modules) > 0: - raise ValueError( - "We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules " - f"need to be offloaded: {', '.join(disk_modules)}." - ) - if len(disk_modules) > 0 and ( - not os.path.isdir(offload_dir) or not os.path.isfile(os.path.join(offload_dir, "index.json")) - ): - disk_state_dict = extract_submodules_state_dict(model.state_dict(), disk_modules) - offload_state_dict(offload_dir, disk_state_dict) - - execution_device = { - name: main_device if device in offload_devices else device for name, device in device_map.items() - } - offload = {name: device in offload_devices for name, device in device_map.items()} - save_folder = offload_dir if len(disk_modules) > 0 else None - if state_dict is not None or save_folder is not None: - weights_map = OffloadedWeightsLoader(state_dict=state_dict, save_folder=save_folder) - else: - weights_map = None - - attach_align_device_hook_on_blocks( - model, - execution_device=execution_device, - offload=offload, - offload_buffers=offload_buffers, - weights_map=weights_map, - **kwargs, - ) - model.hf_device_map = device_map - return model - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) - - -def move_hidden_layers(transformer, h=None): - if h is None: - h = transformer.h - - assert len(gpu_blocks) <= torch.cuda.device_count() - assert sum(gpu_blocks) <= len(h) - ram_blocks = len(h) - sum(gpu_blocks) - - transformer.extrastorage = {} - torch.cuda.empty_cache() - - able_to_pin_layers = True - for i in range(ram_blocks): - h[i].to("cpu") - transformer.extrastorage[i] = copy.deepcopy(h[i]) - smalltensor = torch.tensor(0).to(primary_device) - for param1 in h[i].parameters(): - param1.data = smalltensor - h[i].to(primary_device) - for param in transformer.extrastorage[i].parameters(): - param.requires_grad = False - param.data = param.data.detach() - if able_to_pin_layers: - try: - param.data = param.data.pin_memory() - except: - able_to_pin_layers = False - print(f"WARNING: You only have enough shared GPU memory for {i} out of {ram_blocks} CPU layers. 
Expect suboptimal speed.", file=sys.stderr) - gc.collect() - torch.cuda.empty_cache() - - if ram_blocks: - for param1,param2 in zip(h[0].parameters(),transformer.extrastorage[0].parameters()): - param1.data = param2.data.to(primary_device, non_blocking=False).detach() - - for param1,param2 in zip(h[ram_blocks-1].parameters(),transformer.extrastorage[ram_blocks-1].parameters()): - param1.data = param2.data.to(primary_device, non_blocking=False).detach() - - i = ram_blocks - for j in range(len(gpu_blocks)): - for _ in range(gpu_blocks[j]): - h[i].to(j) - i += 1 - - -def new_forward_neo( - self, - input_ids=None, - past_key_values=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - embs=None, -): - assert len(gpu_blocks) <= torch.cuda.device_count() - assert sum(gpu_blocks) <= len(self.h) - ram_blocks = len(self.h) - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - if position_ids is not None: - position_ids = position_ids.view(-1, input_shape[-1]) - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - past_length = past_key_values[0][0].size(-2) - - device = primary_device if breakmodel else input_ids.device if input_ids is not None else inputs_embeds.device - if position_ids is None: - position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) - - # Attention mask. - if attention_mask is not None: - assert batch_size > 0, "batch_size has to be defined and > 0" - attention_mask = attention_mask.view(batch_size, -1) - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - attention_mask = attention_mask[:, None, None, :] - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. 
- # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * -10000.0 - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x num_heads x N x N - # head_mask has shape n_layer x batch x num_heads x N x N - head_mask = self.get_head_mask(head_mask, getattr(self.config, "num_layers", None) or self.config.n_layer) - - if inputs_embeds is None: - if breakmodel: - input_ids = input_ids.to(primary_device) - inputs_embeds = self.wte(input_ids) - - if embs is not None and not (use_cache is not None and use_cache and past_key_values is not None and len(past_key_values) > 0 and past_key_values[0] is not None): - offset = 0 - for pos, emb in embs: - pos += offset - if len(emb.shape) == 2: - emb = emb.repeat(input_shape[0], 1, 1) - inputs_embeds[:, pos:pos+emb.shape[1]] = emb - offset += emb.shape[1] - - if getattr(self, "wpe", None) is None: - hidden_states = inputs_embeds - else: - if breakmodel: - position_ids = position_ids.to(primary_device) - position_embeds = self.wpe(position_ids) - if breakmodel: - position_embeds = position_embeds.to(primary_device) - hidden_states = inputs_embeds + position_embeds - - if token_type_ids is not None: - token_type_embeds = self.wte(token_type_ids) - hidden_states = hidden_states + token_type_embeds - - hidden_states = self.drop(hidden_states) - - output_shape = input_shape + (hidden_states.size(-1),) - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - - if breakmodel and ram_blocks: - copystream = torch.cuda.Stream(device=primary_device, priority=-1) - - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - - if breakmodel: - if i in range(ram_blocks): - index1 = (i+1)%ram_blocks - for param1,param2 in zip(self.h[index1].parameters(),self.h[(i-1)%ram_blocks].parameters()): - param1.data = param2.data - for param1,param2 in zip(self.h[index1].parameters(),self.extrastorage[index1].parameters()): - with torch.cuda.stream(copystream): - torch.cuda.comm.broadcast(param2.data,out = [param1.data]) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states.cpu(),) - - if getattr(self.config, "gradient_checkpointing", False) and self.training: - - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, use_cache, output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - None, - attention_mask, - head_mask[i], - ) - else: - if breakmodel: - device = primary_device if i < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, i - ram_blocks) - outputs = block( - hidden_states.to(device) if breakmodel and hidden_states is not None else hidden_states, - layer_past=tuple(v.to(device) for v in layer_past if v is not None) if breakmodel and layer_past is not None and i >= ram_blocks and len(layer_past) and layer_past[0].device.index != device else layer_past, - attention_mask=attention_mask.to(device) if breakmodel and attention_mask is not None else attention_mask, - head_mask=head_mask[i].to(device) if breakmodel and head_mask[i] is not None else head_mask[i], - use_cache=use_cache, - output_attentions=output_attentions, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - - - if breakmodel: - if i in range(ram_blocks): - torch.cuda.synchronize() - torch.cuda.empty_cache() - - if breakmodel: - if ram_blocks: - del copystream - torch.cuda.empty_cache() - hidden_states = hidden_states.to(primary_device) - hidden_states = self.ln_f(hidden_states) - if breakmodel: - hidden_states = hidden_states.to(primary_device) - - hidden_states = hidden_states.view(*output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -def new_forward_xglm( - self, - input_ids=None, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - head_mask=None, - cross_attn_head_mask=None, - past_key_values=None, - inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, -): - assert len(gpu_blocks) <= torch.cuda.device_count() - assert sum(gpu_blocks) <= len(self.layers) - ram_blocks = len(self.layers) - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - # past_key_values_length - past_key_values_length = 
past_key_values[0][0].shape[2] if past_key_values is not None else 0 - - if inputs_embeds is None: - if breakmodel: - input_ids = input_ids.to(primary_device) - inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) - - # expand encoder attention mask - if encoder_hidden_states is not None and encoder_attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) - - # embed positions - if breakmodel: - inputs_embeds = inputs_embeds.to(primary_device) - positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length) - if breakmodel: - positions = positions.to(primary_device) - - hidden_states = inputs_embeds + positions - - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None - next_decoder_cache = () if use_cache else None - - if breakmodel and ram_blocks: - copystream = torch.cuda.Stream(device=primary_device, priority=-1) - - # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired - for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): - if attn_mask is not None: - assert attn_mask.size()[0] == ( - len(self.layers) - ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." - for idx, decoder_layer in enumerate(self.layers): - i = idx - if breakmodel: - if i in range(ram_blocks): - index1 = (i+1)%ram_blocks - for param1,param2 in zip(self.layers[index1].parameters(),self.layers[(i-1)%ram_blocks].parameters()): - param1.data = param2.data - for param1,param2 in zip(self.layers[index1].parameters(),self.extrastorage[index1].parameters()): - with torch.cuda.stream(copystream): - torch.cuda.comm.broadcast(param2.data,out = [param1.data]) - - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if output_hidden_states: - all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) - if self.training and (dropout_probability < self.layerdrop): - continue - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - "`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`..." 
- ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, use_cache) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - encoder_hidden_states, - encoder_attention_mask, - head_mask[idx] if head_mask is not None else None, - cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, - None, - ) - else: - if breakmodel: - device = primary_device if i < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, i - ram_blocks) - layer_outputs = decoder_layer( - hidden_states.to(device) if breakmodel and hidden_states is not None else hidden_states, - attention_mask=attention_mask.to(device) if breakmodel and attention_mask is not None else attention_mask, - encoder_hidden_states=encoder_hidden_states.to(device) if breakmodel and encoder_hidden_states is not None else encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask.to(device) if breakmodel and encoder_attention_mask is not None else encoder_attention_mask, - layer_head_mask=((head_mask[idx].to(device) if breakmodel and head_mask[idx] is not None else head_mask[idx]) if head_mask is not None else None), - cross_attn_layer_head_mask=( - (cross_attn_head_mask[idx].to(device) if breakmodel and cross_attn_head_mask[idx] is not None else cross_attn_head_mask[idx]) if cross_attn_head_mask is not None else None - ), - past_key_value=tuple(v.to(device) for v in past_key_value if v is not None) if breakmodel and past_key_value is not None and i >= ram_blocks and len(past_key_value) and past_key_value[0].device.index != device else past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - if encoder_hidden_states is not None: - all_cross_attentions += (layer_outputs[2],) - - if breakmodel: - if i in range(ram_blocks): - torch.cuda.synchronize() - torch.cuda.empty_cache() - - if breakmodel: - if ram_blocks: - del copystream - torch.cuda.empty_cache() - hidden_states = hidden_states.to(primary_device) - hidden_states = self.layer_norm(hidden_states) - if breakmodel: - hidden_states = hidden_states.to(primary_device) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v - for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - cross_attentions=all_cross_attentions, - ) - - -def new_forward_opt( - self, - input_ids=None, - attention_mask=None, - head_mask=None, - past_key_values=None, - inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, -): - assert len(gpu_blocks) <= torch.cuda.device_count() - assert sum(gpu_blocks) <= len(self.layers) - ram_blocks = len(self.layers) - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - - - output_attentions = output_attentions if output_attentions is not None else 
self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - - if inputs_embeds is None: - if breakmodel: - input_ids = input_ids.to(primary_device) - inputs_embeds = self.embed_tokens(input_ids) - - # embed positions - if breakmodel: - inputs_embeds = inputs_embeds.to(primary_device) - if attention_mask is None: - attention_mask = torch.ones(inputs_embeds.shape[:2], dtype=torch.bool, device=inputs_embeds.device) - - positions = self.embed_positions(attention_mask)[:, past_key_values_length:, :] - if breakmodel: - positions = positions.to(primary_device) - - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) - - if self.project_in is not None: - inputs_embeds = self.project_in(inputs_embeds) - - hidden_states = inputs_embeds + positions - - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - if breakmodel and ram_blocks: - copystream = torch.cuda.Stream(device=primary_device, priority=-1) - - # check if head_mask has a correct number of layers specified if desired - for attn_mask, mask_name in zip([head_mask], ["head_mask"]): - if attn_mask is not None: - if attn_mask.size()[0] != (len(self.layers)): - raise ValueError( - f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" - f" {head_mask.size()[0]}." - ) - - for idx, decoder_layer in enumerate(self.layers): - i = idx - if breakmodel: - if i in range(ram_blocks): - index1 = (i+1)%ram_blocks - for param1,param2 in zip(self.layers[index1].parameters(),self.layers[(i-1)%ram_blocks].parameters()): - param1.data = param2.data - for param1,param2 in zip(self.layers[index1].parameters(),self.extrastorage[index1].parameters()): - with torch.cuda.stream(copystream): - torch.cuda.comm.broadcast(param2.data,out = [param1.data]) - - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if output_hidden_states: - all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) - if self.training and (dropout_probability < self.layerdrop): - continue - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - head_mask[idx] if head_mask is not None else None, - None, - ) - else: - if breakmodel: - device = primary_device if i < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, i - ram_blocks) - layer_outputs = decoder_layer( - hidden_states.to(device) if breakmodel and hidden_states is not None else hidden_states, - attention_mask=attention_mask.to(device) if breakmodel and attention_mask is not None else attention_mask, - layer_head_mask=((head_mask[idx].to(device) if breakmodel and head_mask[idx] is not None else head_mask[idx]) if head_mask is not None else None), - past_key_value=tuple(v.to(device) for v in past_key_value if v is not None) if breakmodel and past_key_value is not None and i >= ram_blocks and len(past_key_value) and past_key_value[0].device.index != device else past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - if breakmodel: - if i in range(ram_blocks): - torch.cuda.synchronize() - torch.cuda.empty_cache() - - if breakmodel: - if ram_blocks: - del copystream - torch.cuda.empty_cache() - hidden_states = hidden_states.to(primary_device) - if self.project_out is not None: - hidden_states = self.project_out(hidden_states) - if breakmodel: - hidden_states = hidden_states.to(primary_device) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index fd4c2a1a..96d5f0c4 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -13,12 +13,6 @@ import modeling.lazy_loader as lazy_loader import koboldai_settings from logger import logger -try: - import breakmodel -except ModuleNotFoundError as e: - # Breakmodel is only expected to work on GPU - if not utils.koboldai_vars.use_colab_tpu: - raise e from modeling.inference_models.hf_torch import HFTorchInferenceModel @@ -70,14 +64,6 @@ class model_backend(HFTorchInferenceModel): # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel)) - if ( - self.lazy_load - and utils.koboldai_vars.hascuda - and self.breakmodel - and not self.nobreakmodel - ): - logger.debug("loading breakmodel") - self.breakmodel_device_config(self.model_config) if self.lazy_load: # If we're using lazy loader, we need to figure out what the model's hidden layers are called @@ -141,7 
+127,7 @@ class model_backend(HFTorchInferenceModel): self.get_local_model_path(ignore_existance=True) ) - if utils.koboldai_vars.fp32_model and not breakmodel.disk_blocks: + if utils.koboldai_vars.fp32_model: # Use save_pretrained to convert fp32 models to fp16, # unless we are using disk cache because save_pretrained # is not supported in that case @@ -247,27 +233,6 @@ class model_backend(HFTorchInferenceModel): shutil.rmtree("cache/") self.patch_embedding() - - - if utils.koboldai_vars.hascuda: - if self.usegpu: - # Use just VRAM - self.model = self.model.half().to(utils.koboldai_vars.gpu_device) - elif self.breakmodel: - # Use both RAM and VRAM (breakmodel) - if not self.lazy_load: - self.breakmodel_device_config(self.model.config) - self._move_to_devices() - elif breakmodel.disk_blocks > 0: - # Use disk - self._move_to_devices() - else: - # Use CPU - self.model = self.model.to("cpu").float() - elif breakmodel.disk_blocks > 0: - self._move_to_devices() - else: - self.model = self.model.to("cpu").float() self.model.kai_model = self diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 4226d1b1..881ca604 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -157,7 +157,6 @@ class HFInferenceModel(InferenceModel): def set_input_parameters(self, parameters): if self.hf_torch and hasattr(self, "get_model_type") and self.get_model_type() != "gpt2": - import breakmodel layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None if layer_count is not None and layer_count >= 0 and not self.nobreakmodel: gpu_count = torch.cuda.device_count() @@ -176,9 +175,8 @@ class HFInferenceModel(InferenceModel): self.disk_layers = parameters['Disk_Layers'] if 'Disk_Layers' in parameters else 0 if isinstance(self.disk_layers, str): self.disk_layers = int(self.disk_layers) if self.disk_layers.isnumeric() else 0 - breakmodel.gpu_blocks = layers - breakmodel.disk_blocks = self.disk_layers - self.usegpu = self.cpu_layers == 0 and breakmodel.disk_blocks == 0 and sum(self.layers)-self.layers[0] == 0 + print("TODO: Allow config") + # self.usegpu = self.cpu_layers == 0 and breakmodel.disk_blocks == 0 and sum(self.layers)-self.layers[0] == 0 self.model_type = self.get_model_type() self.breakmodel = ((self.model_type != 'gpt2') or self.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not self.nobreakmodel self.lazy_load = True diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 2f575e73..abc3c54c 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -9,6 +9,7 @@ import functools import itertools import traceback import contextlib +from accelerate.utils.modeling import infer_auto_device_map, load_checkpoint_in_model from tqdm.auto import tqdm from typing import Dict, List, Optional, Union @@ -40,7 +41,6 @@ from modeling.inference_model import ( ) try: - import breakmodel import accelerate.utils except ModuleNotFoundError as e: if not utils.koboldai_vars.use_colab_tpu: @@ -125,17 +125,6 @@ class HFTorchInferenceModel(HFInferenceModel): else: return "Unknown" - def get_auxilary_device(self): - """Get device auxilary tensors like inputs should be stored on.""" - - # NOTE: TPU isn't a torch device, 
so TPU stuff gets sent to CPU. - if utils.koboldai_vars.hascuda and self.usegpu: - return utils.koboldai_vars.gpu_device - elif utils.koboldai_vars.hascuda and self.breakmodel: - import breakmodel - return breakmodel.primary_device - return "cpu" - def _post_load(m_self) -> None: if not utils.koboldai_vars.model_type: @@ -237,7 +226,7 @@ class HFTorchInferenceModel(HFInferenceModel): else: gen_in = prompt_tokens - device = self.get_auxilary_device() + device = utils.get_auxilary_device() gen_in = gen_in.to(device) additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else [] @@ -254,8 +243,7 @@ class HFTorchInferenceModel(HFInferenceModel): len(prompt_tokens) + max_new, utils.koboldai_vars.max_length ), repetition_penalty=1.0, - bad_words_ids=self.badwordsids - + additional_bad_words_ids, + bad_words_ids=self.badwordsids + additional_bad_words_ids, use_cache=True, num_return_sequences=batch_count, ) @@ -286,7 +274,27 @@ class HFTorchInferenceModel(HFInferenceModel): # Try to determine model type from either AutoModel or falling back to legacy try: - return AutoModelForCausalLM.from_pretrained(location, **tf_kwargs) + model = AutoModelForCausalLM.from_config(self.model_config) + + # load_checkpoint_in_model( + # model.model, + # location, + # device_map=device_map + # offload_folder="accelerate-disk-cache", + # dtype="float16", + # offload_state_dict=True + # ) + # model.tie_weights() + + device_map = infer_auto_device_map( + model, + max_memory={0: "10GiB", 1: "7GiB", "cpu": "15GiB"}, + no_split_module_classes=["GPTJBlock"], + ) + + return AutoModelForCausalLM.from_pretrained( + location, device_map=device_map + ) # , **tf_kwargs) except Exception as e: traceback_string = traceback.format_exc().lower() @@ -325,49 +333,6 @@ class HFTorchInferenceModel(HFInferenceModel): return True - def _move_to_devices(self) -> None: - for key, value in self.model.state_dict().items(): - target_dtype = ( - torch.float32 if breakmodel.primary_device == "cpu" else torch.float16 - ) - if value.dtype is not target_dtype: - accelerate.utils.set_module_tensor_to_device( - self.model, - tensor_name=key, - device=torch.device(value.device), - value=value, - dtype=target_dtype, - ) - - disk_blocks = breakmodel.disk_blocks - gpu_blocks = breakmodel.gpu_blocks - ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - device_map = {} - - for name in utils.layers_module_names: - layer = int(name.rsplit(".", 1)[1]) - device = ( - ("disk" if layer < disk_blocks else "cpu") - if layer < ram_blocks - else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) - ) - device_map[name] = device - - for name in utils.get_missing_module_names(self.model, list(device_map.keys())): - device_map[name] = breakmodel.primary_device - - breakmodel.dispatch_model_ex( - self.model, - device_map, - main_device=breakmodel.primary_device, - offload_buffers=True, - offload_dir="accelerate-disk-cache", - ) - - gc.collect() - return - # Function to patch transformers to use our soft prompt def patch_embedding(self) -> None: if getattr(Embedding, "_koboldai_patch_causallm_model", None): @@ -413,11 +378,10 @@ class HFTorchInferenceModel(HFInferenceModel): if not self.lazy_load: return - - disk_blocks = breakmodel.disk_blocks - gpu_blocks = breakmodel.gpu_blocks - ram_blocks = ram_blocks = n_layers - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) + # disk_blocks = breakmodel.disk_blocks + # gpu_blocks = 
breakmodel.gpu_blocks + # ram_blocks = ram_blocks = n_layers - sum(gpu_blocks) + # cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) def lazy_load_callback( model_dict: Dict[str, Union[lazy_loader.LazyTensor, torch.Tensor]], @@ -428,6 +392,7 @@ class HFTorchInferenceModel(HFInferenceModel): if lazy_load_callback.nested: return lazy_load_callback.nested = True + return device_map: Dict[str, Union[str, int]] = {} @@ -458,8 +423,7 @@ class HFTorchInferenceModel(HFInferenceModel): utils.koboldai_vars.gpu_device if utils.koboldai_vars.hascuda and self.usegpu else "cpu" - if not utils.koboldai_vars.hascuda - or not self.breakmodel + if not utils.koboldai_vars.hascuda or not self.breakmodel else breakmodel.primary_device ) else: @@ -479,8 +443,7 @@ class HFTorchInferenceModel(HFInferenceModel): else "disk" if layer < disk_blocks and layer < ram_blocks else "cpu" - if not utils.koboldai_vars.hascuda - or not self.breakmodel + if not utils.koboldai_vars.hascuda or not self.breakmodel else "shared" if layer < ram_blocks else bisect.bisect_right( @@ -519,7 +482,7 @@ class HFTorchInferenceModel(HFInferenceModel): total=num_tensors, desc="Loading model tensors", file=utils.UIProgressBarFile(), - position=1 + position=1, ) if not is_safetensors: @@ -550,7 +513,7 @@ class HFTorchInferenceModel(HFInferenceModel): f.close() ziproot = z.namelist()[0].split("/")[0] f = z.open(f"{ziproot}/data/{storage_key}") - + current_offset = 0 if current_offset != model_dict[key].seek_offset: f.read(model_dict[key].seek_offset - current_offset) @@ -574,7 +537,7 @@ class HFTorchInferenceModel(HFInferenceModel): ) ) # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ", end="", flush=True) - #logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ") + # logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
") model_dict[key] = model_dict[key].materialize( f, map_location="cpu" ) @@ -584,10 +547,7 @@ class HFTorchInferenceModel(HFInferenceModel): convert_to_float16 and breakmodel.primary_device != "cpu" and utils.koboldai_vars.hascuda - and ( - self.breakmodel - or self.usegpu - ) + and (self.breakmodel or self.usegpu) and model_dict[key].dtype is torch.float32 ): model_dict[key] = model_dict[key].to(torch.float16) @@ -630,15 +590,11 @@ class HFTorchInferenceModel(HFInferenceModel): convert_to_float16 and breakmodel.primary_device != "cpu" and utils.koboldai_vars.hascuda - and ( - self.breakmodel - or self.usegpu - ) + and (self.breakmodel or self.usegpu) ): dtype = torch.float16 if breakmodel.primary_device == "cpu" or ( - not self.usegpu - and not self.breakmodel + not self.usegpu and not self.breakmodel ): dtype = torch.float32 if ( @@ -693,10 +649,7 @@ class HFTorchInferenceModel(HFInferenceModel): convert_to_float16 and breakmodel.primary_device != "cpu" and utils.koboldai_vars.hascuda - and ( - self.breakmodel - or self.usegpu - ) + and (self.breakmodel or self.usegpu) and model_dict[key].dtype is torch.float32 ): model_dict[key] = model_dict[key].to(torch.float16) @@ -741,15 +694,11 @@ class HFTorchInferenceModel(HFInferenceModel): convert_to_float16 and breakmodel.primary_device != "cpu" and utils.koboldai_vars.hascuda - and ( - self.breakmodel - or self.usegpu - ) + and (self.breakmodel or self.usegpu) ): dtype = torch.float16 if breakmodel.primary_device == "cpu" or ( - not self.usegpu - and not self.breakmodel + not self.usegpu and not self.breakmodel ): dtype = torch.float32 if ( @@ -793,6 +742,7 @@ class HFTorchInferenceModel(HFInferenceModel): yield False def breakmodel_device_list(self, n_layers, primary=None, selected=None): + return # TODO: Find a better place for this or rework this device_count = torch.cuda.device_count() @@ -824,6 +774,7 @@ class HFTorchInferenceModel(HFInferenceModel): def breakmodel_device_config(self, config): # TODO: Find a better place for this or rework this + return global breakmodel, generator import breakmodel @@ -840,7 +791,7 @@ class HFTorchInferenceModel(HFInferenceModel): logger.info("Breakmodel not specified, assuming GPU 0") breakmodel.gpu_blocks = [n_layers] n_layers = 0 - + else: s = n_layers for i in range(len(breakmodel.gpu_blocks)): @@ -857,8 +808,14 @@ class HFTorchInferenceModel(HFInferenceModel): logger.init_ok("Final device configuration:", status="Info") self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device) - with open("settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w") as file: - file.write("{}\n{}".format(",".join(map(str, breakmodel.gpu_blocks)), breakmodel.disk_blocks)) + with open( + "settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w" + ) as file: + file.write( + "{}\n{}".format( + ",".join(map(str, breakmodel.gpu_blocks)), breakmodel.disk_blocks + ) + ) # If all layers are on the same device, use the old GPU generation mode while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0: @@ -876,9 +833,6 @@ class HFTorchInferenceModel(HFInferenceModel): if not breakmodel.gpu_blocks: logger.warning("Nothing assigned to a GPU, reverting to CPU only mode") - import breakmodel - - breakmodel.primary_device = "cpu" self.breakmodel = False self.usegpu = False return diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index 5a27d549..a0f67d4a 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -101,6 +101,7 @@ class 
TorchLazyTensor(LazyTensor): stride: Optional[Tuple[int, ...]] = None, requires_grad=False, backward_hooks: Any = None, + file_handle: Any = None ): self.storage_type = storage_type self.key = key @@ -111,6 +112,7 @@ class TorchLazyTensor(LazyTensor): self.stride = stride self.requires_grad = requires_grad self.backward_hooks = backward_hooks + self.file_handle = file_handle def __view(self, f: Callable): return f"{type(self).__name__}(storage_type={f(self.storage_type)}, key={f(self.key)}, location={f(self.location)}, dtype={f(self.dtype)}, seek_offset={f(self.seek_offset)}, shape={f(self.shape)}, stride={f(self.stride)}, requires_grad={f(self.requires_grad)}, backward_hooks={f(self.backward_hooks)})" @@ -120,11 +122,13 @@ class TorchLazyTensor(LazyTensor): def materialize( self, - checkpoint: Union[zipfile.ZipFile, zipfile.ZipExtFile], + checkpoint: Union[zipfile.ZipFile, zipfile.ZipExtFile] = None, map_location=None, no_grad=True, filename="pytorch_model.bin", ) -> torch.Tensor: + checkpoint = checkpoint or self.file_handle + filename = os.path.basename(os.path.normpath(filename)).split(".")[0] size = reduce(lambda x, y: x * y, self.shape, 1) dtype = self.dtype @@ -237,6 +241,8 @@ class _LazyUnpickler(RestrictedUnpickler): lazy_loaded_storages: Dict[str, LazyTensor] def __init__(self, *args, **kwargs): + # print(args, kwargs) + self.file_handle = args[0] self.lazy_loaded_storages = {} return super().__init__(*args, **kwargs) @@ -247,7 +253,7 @@ class _LazyUnpickler(RestrictedUnpickler): typename == "storage" ), f"Unknown typename for persistent_load, expected 'storage' but got '{typename}'" storage_type, key, location, _ = saved_id[1:] - return TorchLazyTensor(storage_type, key, location) + return TorchLazyTensor(storage_type, key, location, file_handle=self.file_handle) def load(self, *args, **kwargs): retval = super().load(*args, **kwargs) diff --git a/modeling/patches.py b/modeling/patches.py index f6b1ff0a..c8c070f8 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -10,6 +10,7 @@ from transformers import ( PreTrainedModel, modeling_utils, ) +from modeling.lazy_loader import LazyTensor import utils @@ -125,6 +126,173 @@ def patch_transformers_generation() -> None: transformers.generation.logits_process.NoBadWordsLogitsProcessor.__init__ = new_init +CURRENT_CHECKPOINT = None +def patch_transformers_for_lazyload() -> None: + import torch + import inspect + from accelerate.utils import set_module_tensor_to_device, offload_weight + + def _load_state_dict_into_meta_model( + model, + state_dict, + loaded_state_dict_keys, # left for now but could be removed, see below + start_prefix, + expected_keys, + device_map=None, + offload_folder=None, + offload_index=None, + state_dict_folder=None, + state_dict_index=None, + dtype=None, + load_in_8bit=False, + is_safetensors=False, + keep_in_fp32_modules=None, + ): + """ + This is somewhat similar to `_load_state_dict_into_model`, but deals with a model that has some or all of its + params on a `meta` device. It replaces the model params with the data from the `state_dict`, while moving the + params back to the normal device, but only for `loaded_state_dict_keys`. + + `start_prefix` is used for models which insert their name into model keys, e.g. 
`bert` in + `bert.pooler.dense.weight` + + """ + + print("DEVMAP", device_map) + + # XXX: remaining features to implement to be fully compatible with _load_state_dict_into_model + # - deepspeed zero 3 support + # - need to copy metadata if any - see _load_state_dict_into_model + # - handling error_msgs - mimicking the error handling in module._load_from_state_dict() + # - Is there a situation where some keys aren't in `loaded_state_dict_keys` and in which case + # they won't get loaded. + + if load_in_8bit: + from .utils.bitsandbytes import set_module_8bit_tensor_to_device + + error_msgs = [] + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if "gamma" in key: + new_key = key.replace("gamma", "weight") + if "beta" in key: + new_key = key.replace("beta", "bias") + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + for param_name, param in state_dict.items(): + + # BEGIN PATCH + if isinstance(param, LazyTensor): + print("Materializing", param_name) + param = param.materialize() + # END PATCH + + # First part of the test is always true as load_state_dict_keys always contains state_dict keys. + if ( + param_name not in loaded_state_dict_keys + or param_name not in expected_keys + ): + continue + + if param_name.startswith(start_prefix): + param_name = param_name[len(start_prefix) :] + + module_name = param_name + set_module_kwargs = {} + + # We convert floating dtypes to the `dtype` passed. We want to keep the buffers/params + # in int/uint/bool and not cast them. + if dtype is not None and torch.is_floating_point(param): + if ( + keep_in_fp32_modules is not None + and any( + module_to_keep_in_fp32 in param_name + for module_to_keep_in_fp32 in keep_in_fp32_modules + ) + and dtype == torch.float16 + ): + param = param.to(torch.float32) + + # For backward compatibility with older versions of `accelerate` + # TODO: @sgugger replace this check with version check at the next `accelerate` release + if "dtype" in list( + inspect.signature(set_module_tensor_to_device).parameters + ): + set_module_kwargs["dtype"] = torch.float32 + else: + param = param.to(dtype) + + # For compatibility with PyTorch load_state_dict which converts state dict dtype to existing dtype in model + if dtype is None: + old_param = model + splits = param_name.split(".") + for split in splits: + old_param = getattr(old_param, split) + if old_param is None: + break + + if old_param is not None: + param = param.to(old_param.dtype) + + set_module_kwargs["value"] = param + + if device_map is None: + param_device = "cpu" + else: + # find next higher level module that is defined in device_map: + # bert.lm_head.weight -> bert.lm_head -> bert -> '' + while len(module_name) > 0 and module_name not in device_map: + module_name = ".".join(module_name.split(".")[:-1]) + if module_name == "" and "" not in device_map: + # TODO: group all errors and raise at the end. 
+ raise ValueError(f"{param_name} doesn't have any device set.") + param_device = device_map[module_name] + if param_device == "disk": + if not is_safetensors: + offload_index = offload_weight( + param, param_name, offload_folder, offload_index + ) + elif param_device == "cpu" and state_dict_index is not None: + state_dict_index = offload_weight( + param, param_name, state_dict_folder, state_dict_index + ) + elif not load_in_8bit: + # For backward compatibility with older versions of `accelerate` + set_module_tensor_to_device( + model, param_name, param_device, **set_module_kwargs + ) + else: + if ( + param.dtype == torch.int8 + and param_name.replace("weight", "SCB") in state_dict.keys() + ): + fp16_statistics = state_dict[param_name.replace("weight", "SCB")] + else: + fp16_statistics = None + + if "SCB" not in param_name: + set_module_8bit_tensor_to_device( + model, + param_name, + param_device, + value=param, + fp16_statistics=fp16_statistics, + ) + + return error_msgs, offload_index, state_dict_index + + transformers.modeling_utils._load_state_dict_into_meta_model = ( + _load_state_dict_into_meta_model + ) + + def patch_transformers() -> None: patch_transformers_download() patch_transformers_loader() diff --git a/utils.py b/utils.py index 13ebb6a3..863bda2f 100644 --- a/utils.py +++ b/utils.py @@ -656,9 +656,9 @@ def get_auxilary_device(): # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU. if koboldai_vars.hascuda and koboldai_vars.usegpu: return koboldai_vars.gpu_device - elif koboldai_vars.hascuda and koboldai_vars.breakmodel: - import breakmodel - return breakmodel.primary_device + elif koboldai_vars.hascuda: + # TODO: Primary device + return "cuda" return "cpu" #==================================================================# From 6f93150e4d85057d77464f93c6a87a43f68cc35d Mon Sep 17 00:00:00 2001 From: somebody Date: Sun, 28 May 2023 12:25:31 -0500 Subject: [PATCH 02/37] Work on lazyload --- .../generic_hf_torch/class.py | 46 ++-- modeling/lazy_loader.py | 219 +++++------------- modeling/patches.py | 2 +- prompt_tuner.py | 2 +- 4 files changed, 97 insertions(+), 172 deletions(-) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index 96d5f0c4..539d2018 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -18,11 +18,11 @@ from modeling.inference_models.hf_torch import HFTorchInferenceModel model_backend_name = "Huggingface" + class model_backend(HFTorchInferenceModel): - def _initialize_model(self): return - + def _load(self, save_model: bool, initial_load: bool) -> None: utils.koboldai_vars.allowsp = True @@ -34,9 +34,7 @@ class model_backend(HFTorchInferenceModel): # utils.koboldai_vars.custmodpth = utils.koboldai_vars.model if self.model_name == "NeoCustom": - self.model_name = os.path.basename( - os.path.normpath(self.path) - ) + self.model_name = os.path.basename(os.path.normpath(self.path)) utils.koboldai_vars.model = self.model_name # If we specify a model and it's in the root directory, we need to move @@ -63,13 +61,18 @@ class model_backend(HFTorchInferenceModel): # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors - logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel)) + logger.debug( + "lazy_load: {} hascuda: {} breakmodel: {} 
nobreakmode: {}".format( + self.lazy_load, + utils.koboldai_vars.hascuda, + self.breakmodel, + self.nobreakmodel, + ) + ) if self.lazy_load: # If we're using lazy loader, we need to figure out what the model's hidden layers are called - with lazy_loader.use_lazy_load( - dematerialized_modules=True, use_accelerate_init_empty_weights=True - ): + with lazy_loader.use_lazy_load(dematerialized_modules=True): try: metamodel = AutoModelForCausalLM.from_config(self.model_config) utils.layers_module_names = utils.get_layers_module_names(metamodel) @@ -195,7 +198,9 @@ class model_backend(HFTorchInferenceModel): pass if not any_success: - raise RuntimeError(f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revisison}'") + raise RuntimeError( + f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revisison}'" + ) else: # Handle saving sharded models @@ -233,11 +238,24 @@ class model_backend(HFTorchInferenceModel): shutil.rmtree("cache/") self.patch_embedding() - - + self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() def _save_settings(self): - with open("settings/{}.generic_hf_torch.model_backend.settings".format(self.model_name.replace("/", "_")), "w") as f: - json.dump({"layers": self.layers if 'layers' in vars(self) else [], "disk_layers": self.disk_layers if 'disk_layers' in vars(self) else 0}, f, indent="") \ No newline at end of file + with open( + "settings/{}.generic_hf_torch.model_backend.settings".format( + self.model_name.replace("/", "_") + ), + "w", + ) as f: + json.dump( + { + "layers": self.layers if "layers" in vars(self) else [], + "disk_layers": self.disk_layers + if "disk_layers" in vars(self) + else 0, + }, + f, + indent="", + ) diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index a0f67d4a..85ed495d 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -61,6 +61,7 @@ from typing import Any, Callable, Dict, Optional, Tuple, Type, Union # support it yet. 
try: import safetensors + HAS_SAFETENSORS = True except ModuleNotFoundError: HAS_SAFETENSORS = False @@ -68,9 +69,6 @@ except ModuleNotFoundError: import utils -_EXTRA_STATE_KEY_SUFFIX = "_extra_state" - - STORAGE_TYPE_MAP = { torch.float64: torch.DoubleStorage, torch.float32: torch.FloatStorage, @@ -84,6 +82,8 @@ STORAGE_TYPE_MAP = { torch.bfloat16: torch.BFloat16Storage, } +# Storage of zipfile handles for each shard +torch_checkpoint_file_handles = {} class LazyTensor: pass @@ -101,7 +101,6 @@ class TorchLazyTensor(LazyTensor): stride: Optional[Tuple[int, ...]] = None, requires_grad=False, backward_hooks: Any = None, - file_handle: Any = None ): self.storage_type = storage_type self.key = key @@ -112,7 +111,7 @@ class TorchLazyTensor(LazyTensor): self.stride = stride self.requires_grad = requires_grad self.backward_hooks = backward_hooks - self.file_handle = file_handle + self.file_name = None def __view(self, f: Callable): return f"{type(self).__name__}(storage_type={f(self.storage_type)}, key={f(self.key)}, location={f(self.location)}, dtype={f(self.dtype)}, seek_offset={f(self.seek_offset)}, shape={f(self.shape)}, stride={f(self.stride)}, requires_grad={f(self.requires_grad)}, backward_hooks={f(self.backward_hooks)})" @@ -127,7 +126,29 @@ class TorchLazyTensor(LazyTensor): no_grad=True, filename="pytorch_model.bin", ) -> torch.Tensor: - checkpoint = checkpoint or self.file_handle + + + + # if f not in torch_tensor_container_file_map: + # torch_tensor_container_file_map[f] = [] + + # with zipfile.ZipFile(f, "r") as z: + # paths = z.namelist() + + # for name in paths: + # val = name.split("/data/")[-1] + # if not val.isdecimal(): + # continue + # torch_tensor_container_file_map[f].append(int(val)) + # torch_tensor_container_file_map[f].sort() + # print(torch_tensor_container_file_map) + + + + + if not checkpoint: + checkpoint = torch_checkpoint_file_handles[self.file_name] + filename = self.file_name filename = os.path.basename(os.path.normpath(filename)).split(".")[0] size = reduce(lambda x, y: x * y, self.shape, 1) @@ -141,19 +162,23 @@ class TorchLazyTensor(LazyTensor): >> 3 ) ) + if isinstance(checkpoint, zipfile.ZipFile): try: f = checkpoint.open(f"archive/data/{self.key}", "r") except: f = checkpoint.open(f"{filename}/data/{self.key}", "r") - f.read(self.seek_offset) + f.seek(self.seek_offset, os.SEEK_CUR) + # f.read(self.seek_offset) else: f = checkpoint + try: storage = STORAGE_TYPE_MAP[dtype].from_buffer(f.read(nbytes), "little") finally: if isinstance(checkpoint, zipfile.ZipFile): f.close() + storage = torch.serialization._get_restore_location(map_location)( storage, self.location ) @@ -242,7 +267,6 @@ class _LazyUnpickler(RestrictedUnpickler): def __init__(self, *args, **kwargs): # print(args, kwargs) - self.file_handle = args[0] self.lazy_loaded_storages = {} return super().__init__(*args, **kwargs) @@ -253,7 +277,9 @@ class _LazyUnpickler(RestrictedUnpickler): typename == "storage" ), f"Unknown typename for persistent_load, expected 'storage' but got '{typename}'" storage_type, key, location, _ = saved_id[1:] - return TorchLazyTensor(storage_type, key, location, file_handle=self.file_handle) + return TorchLazyTensor( + storage_type, key, location + ) def load(self, *args, **kwargs): retval = super().load(*args, **kwargs) @@ -277,117 +303,6 @@ def _rebuild_tensor(lazy_storage: LazyTensor, storage_offset, shape, stride): return lazy_storage -# Modified version of https://github.com/pytorch/pytorch/blob/v1.11.0-rc4/torch/nn/modules/module.py#L1346-L1438 -def 
_load_from_state_dict( - self, - state_dict, - prefix, - local_metadata, - strict, - missing_keys, - unexpected_keys, - error_msgs, -): - for hook in self._load_state_dict_pre_hooks.values(): - hook( - state_dict, - prefix, - local_metadata, - strict, - missing_keys, - unexpected_keys, - error_msgs, - ) - - persistent_buffers = { - k: v - for k, v in self._buffers.items() - if k not in self._non_persistent_buffers_set - } - local_name_params = itertools.chain( - self._parameters.items(), persistent_buffers.items() - ) - local_state = {k: v for k, v in local_name_params if v is not None} - - for name, param in local_state.items(): - key = prefix + name - if key in state_dict: - input_param = state_dict[key] - if not torch.overrides.is_tensor_like(input_param): - error_msgs.append( - 'While copying the parameter named "{}", ' - "expected torch.Tensor or Tensor-like object from checkpoint but " - "received {}".format(key, type(input_param)) - ) - continue - - # This is used to avoid copying uninitialized parameters into - # non-lazy modules, since they dont have the hook to do the checks - # in such case, it will error when accessing the .shape attribute. - is_param_lazy = torch.nn.parameter.is_lazy(param) - # Backward compatibility: loading 1-dim tensor from 0.3.* to version 0.4+ - if ( - not is_param_lazy - and len(param.shape) == 0 - and len(input_param.shape) == 1 - ): - input_param = input_param[0] - - if not is_param_lazy and input_param.shape != param.shape: - # local shape should match the one in checkpoint - error_msgs.append( - "size mismatch for {}: copying a param with shape {} from checkpoint, " - "the shape in current model is {}.".format( - key, input_param.shape, param.shape - ) - ) - continue - try: - with torch.no_grad(): - # param.copy_(input_param) - new_param = torch.nn.Parameter( - input_param, requires_grad=param.requires_grad - ) # This line is new - if name in self._parameters: # This line is new - self._parameters[name] = new_param # This line is new - if name in persistent_buffers: # This line is new - self._buffers[name] = new_param # This line is new - except Exception as ex: - error_msgs.append( - 'While copying the parameter named "{}", ' - "whose dimensions in the model are {} and " - "whose dimensions in the checkpoint are {}, " - "an exception occurred : {}.".format( - key, param.size(), input_param.size(), ex.args - ) - ) - elif strict: - missing_keys.append(key) - - extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX - if ( - hasattr(Module, "set_extra_state") - and getattr(self.__class__, "set_extra_state", Module.set_extra_state) - is not Module.set_extra_state - ): # if getattr(self.__class__, "set_extra_state", Module.set_extra_state) is not Module.set_extra_state: - if extra_state_key in state_dict: - self.set_extra_state(state_dict[extra_state_key]) - elif strict: - missing_keys.append(extra_state_key) - elif strict and (extra_state_key in state_dict): - unexpected_keys.append(extra_state_key) - - if strict: - for key in state_dict.keys(): - if key.startswith(prefix) and key != extra_state_key: - input_name = key[len(prefix) :] - input_name = input_name.split(".", 1)[ - 0 - ] # get the name of param/buffer/child - if input_name not in self._modules and input_name not in local_state: - unexpected_keys.append(key) - - def safetensors_load_tensor_independently( checkpoint_file: str, tensor_key: str, device: Any ) -> torch.Tensor: @@ -418,7 +333,9 @@ def patch_safetensors(callback): tensors = {} with safetensors.safe_open( - checkpoint_file, 
framework="pt", device=intermediary_device, + checkpoint_file, + framework="pt", + device=intermediary_device, ) as f: for key in f.keys(): tensors[key] = None @@ -426,7 +343,9 @@ def patch_safetensors(callback): for key in tensors.keys(): tensors[key] = SafetensorsLazyTensor( - checkpoint_file=checkpoint_file, key=key, location=intermediary_device, + checkpoint_file=checkpoint_file, + key=key, + location=intermediary_device, ) if callback is not None: @@ -442,6 +361,13 @@ def patch_safetensors(callback): transformers.modeling_utils.safe_load_file = safetensors_load +def get_torch_tensor_file(file: str, lazy_tensor: TorchLazyTensor): + with zipfile.ZipFile(file, "r") as z: + storage_key = lazy_tensor.key + ziproot = z.namelist()[0].split("/")[0] + f = z.open(f"{ziproot}/data/{storage_key}") + # TODO: Maybe some file seeking + return f @contextlib.contextmanager def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler): @@ -468,7 +394,6 @@ def use_lazy_load( enable=True, callback: Optional[Callable] = None, dematerialized_modules=False, - use_accelerate_init_empty_weights=False, ): if not enable: with use_custom_unpickler(RestrictedUnpickler): @@ -483,22 +408,30 @@ def use_lazy_load( old_torch_load = torch.load def torch_load(f, map_location=None, pickle_module=pickle, **pickle_load_args): - retval = old_torch_load( + model_dict = old_torch_load( f=f, map_location=map_location, pickle_module=pickle_module, **pickle_load_args, ) + + if f not in torch_checkpoint_file_handles: + torch_checkpoint_file_handles[f] = zipfile.ZipFile(f, "r") + + for k,v in model_dict.items(): + v.file_name = f + if callback is not None: callback( - retval, + model_dict, f=f, map_location=map_location, pickle_module=pickle_module, is_safetensors=False, **pickle_load_args, ) - return retval + + return model_dict torch.load = torch_load @@ -506,30 +439,10 @@ def use_lazy_load( patch_safetensors(callback) if dematerialized_modules: - if use_accelerate_init_empty_weights: - import accelerate + import accelerate - init_empty_weights = accelerate.init_empty_weights() - init_empty_weights.__enter__() - else: - old_linear_init = torch.nn.Linear.__init__ - old_embedding_init = torch.nn.Embedding.__init__ - old_layernorm_init = torch.nn.LayerNorm.__init__ - - def linear_init(self, *args, device=None, **kwargs): - return old_linear_init(self, *args, device="meta", **kwargs) - - def embedding_init(self, *args, device=None, **kwargs): - return old_embedding_init(self, *args, device="meta", **kwargs) - - def layernorm_init(self, *args, device=None, **kwargs): - return old_layernorm_init(self, *args, device="meta", **kwargs) - - torch.nn.Linear.__init__ = linear_init - torch.nn.Embedding.__init__ = embedding_init - torch.nn.LayerNorm.__init__ = layernorm_init - old_load_from_state_dict = torch.nn.Module._load_from_state_dict - torch.nn.Module._load_from_state_dict = _load_from_state_dict + init_empty_weights = accelerate.init_empty_weights() + init_empty_weights.__enter__() with use_custom_unpickler(_LazyUnpickler): yield True @@ -538,10 +451,4 @@ def use_lazy_load( torch._utils._rebuild_tensor = old_rebuild_tensor torch.load = old_torch_load if dematerialized_modules: - if use_accelerate_init_empty_weights: - init_empty_weights.__exit__(None, None, None) - else: - torch.nn.Linear.__init__ = old_linear_init - torch.nn.Embedding.__init__ = old_embedding_init - torch.nn.LayerNorm.__init__ = old_layernorm_init - torch.nn.Module._load_from_state_dict = old_load_from_state_dict + 
init_empty_weights.__exit__(None, None, None) diff --git a/modeling/patches.py b/modeling/patches.py index c8c070f8..23d0301c 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -190,7 +190,7 @@ def patch_transformers_for_lazyload() -> None: # BEGIN PATCH if isinstance(param, LazyTensor): - print("Materializing", param_name) + print(".", end="", flush=True) param = param.materialize() # END PATCH diff --git a/prompt_tuner.py b/prompt_tuner.py index b1cbf78d..174a2391 100644 --- a/prompt_tuner.py +++ b/prompt_tuner.py @@ -855,7 +855,7 @@ class TrainerBase(abc.ABC): lazy_load_callback.nested = False # Since we're using lazy loader, we need to figure out what the model's hidden layers are called - with lazy_loader.use_lazy_load(dematerialized_modules=True, use_accelerate_init_empty_weights=True): + with lazy_loader.use_lazy_load(dematerialized_modules=True): try: metamodel = AutoModelForCausalLM.from_config(model_config) except Exception as e: From 14241fc1562dabbae34fe41ab08175a211c2edce Mon Sep 17 00:00:00 2001 From: somebody Date: Sun, 28 May 2023 13:03:24 -0500 Subject: [PATCH 03/37] Speed --- modeling/lazy_loader.py | 95 ++++++++++++++++++++--------------------- modeling/patches.py | 18 +++++--- 2 files changed, 58 insertions(+), 55 deletions(-) diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index 85ed495d..8591bc96 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -46,7 +46,6 @@ POSSIBILITY OF SUCH DAMAGE. import contextlib from functools import reduce -import itertools import zipfile import pickle import torch @@ -54,8 +53,7 @@ import numpy as np import collections import _codecs import os -from torch.nn import Module -from typing import Any, Callable, Dict, Optional, Tuple, Type, Union +from typing import Any, Callable, Dict, Optional, Tuple, Type # Safetensors is a dependency for the local version, TPU/Colab doesn't # support it yet. @@ -85,6 +83,22 @@ STORAGE_TYPE_MAP = { # Storage of zipfile handles for each shard torch_checkpoint_file_handles = {} + +class CheckpointChunkCache: + """Storage for common checkpoint weight files to speed up loading. In order + for this to be effective at all, weights must be loaded in ascending order + of (key, seek_offset).""" + file_name = None + key = None + handle = None + + @classmethod + def clear(cls) -> None: + cls.file_name = None + cls.key = None + cls.handle = None + + class LazyTensor: pass @@ -121,36 +135,37 @@ class TorchLazyTensor(LazyTensor): def materialize( self, - checkpoint: Union[zipfile.ZipFile, zipfile.ZipExtFile] = None, map_location=None, no_grad=True, - filename="pytorch_model.bin", ) -> torch.Tensor: + checkpoint = torch_checkpoint_file_handles[self.file_name] + filename = os.path.basename(os.path.normpath(self.file_name)).split(".")[0] - # if f not in torch_tensor_container_file_map: - # torch_tensor_container_file_map[f] = [] + # Most of the operations are just seeks, let's see if we can optimize that. 
+ if ( + CheckpointChunkCache.file_name != filename + or CheckpointChunkCache.key != self.key + or not CheckpointChunkCache.handle + ): + # Flush cache if invalid + print("!", end="", flush=True) - # with zipfile.ZipFile(f, "r") as z: - # paths = z.namelist() + if CheckpointChunkCache.handle: + CheckpointChunkCache.handle.close() - # for name in paths: - # val = name.split("/data/")[-1] - # if not val.isdecimal(): - # continue - # torch_tensor_container_file_map[f].append(int(val)) - # torch_tensor_container_file_map[f].sort() - # print(torch_tensor_container_file_map) + CheckpointChunkCache.file_name = filename + CheckpointChunkCache.key = self.key + try: + CheckpointChunkCache.handle = checkpoint.open( + f"archive/data/{self.key}", "r" + ) + except KeyError: + CheckpointChunkCache.handle = checkpoint.open( + f"{filename}/data/{self.key}", "r" + ) - - - - if not checkpoint: - checkpoint = torch_checkpoint_file_handles[self.file_name] - filename = self.file_name - - filename = os.path.basename(os.path.normpath(filename)).split(".")[0] size = reduce(lambda x, y: x * y, self.shape, 1) dtype = self.dtype nbytes = ( @@ -163,21 +178,12 @@ class TorchLazyTensor(LazyTensor): ) ) - if isinstance(checkpoint, zipfile.ZipFile): - try: - f = checkpoint.open(f"archive/data/{self.key}", "r") - except: - f = checkpoint.open(f"{filename}/data/{self.key}", "r") - f.seek(self.seek_offset, os.SEEK_CUR) - # f.read(self.seek_offset) - else: - f = checkpoint + assert isinstance(checkpoint, zipfile.ZipFile) - try: - storage = STORAGE_TYPE_MAP[dtype].from_buffer(f.read(nbytes), "little") - finally: - if isinstance(checkpoint, zipfile.ZipFile): - f.close() + CheckpointChunkCache.handle.seek(self.seek_offset, os.SEEK_SET) + storage = STORAGE_TYPE_MAP[dtype].from_buffer( + CheckpointChunkCache.handle.read(nbytes), "little" + ) storage = torch.serialization._get_restore_location(map_location)( storage, self.location @@ -277,9 +283,7 @@ class _LazyUnpickler(RestrictedUnpickler): typename == "storage" ), f"Unknown typename for persistent_load, expected 'storage' but got '{typename}'" storage_type, key, location, _ = saved_id[1:] - return TorchLazyTensor( - storage_type, key, location - ) + return TorchLazyTensor(storage_type, key, location) def load(self, *args, **kwargs): retval = super().load(*args, **kwargs) @@ -361,13 +365,6 @@ def patch_safetensors(callback): transformers.modeling_utils.safe_load_file = safetensors_load -def get_torch_tensor_file(file: str, lazy_tensor: TorchLazyTensor): - with zipfile.ZipFile(file, "r") as z: - storage_key = lazy_tensor.key - ziproot = z.namelist()[0].split("/")[0] - f = z.open(f"{ziproot}/data/{storage_key}") - # TODO: Maybe some file seeking - return f @contextlib.contextmanager def use_custom_unpickler(unpickler: Type[pickle.Unpickler] = RestrictedUnpickler): @@ -418,7 +415,7 @@ def use_lazy_load( if f not in torch_checkpoint_file_handles: torch_checkpoint_file_handles[f] = zipfile.ZipFile(f, "r") - for k,v in model_dict.items(): + for k, v in model_dict.items(): v.file_name = f if callback is not None: diff --git a/modeling/patches.py b/modeling/patches.py index 23d0301c..52fe9e10 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -126,7 +126,6 @@ def patch_transformers_generation() -> None: transformers.generation.logits_process.NoBadWordsLogitsProcessor.__init__ = new_init -CURRENT_CHECKPOINT = None def patch_transformers_for_lazyload() -> None: import torch import inspect @@ -158,8 +157,6 @@ def patch_transformers_for_lazyload() -> None: """ - print("DEVMAP", 
device_map) - # XXX: remaining features to implement to be fully compatible with _load_state_dict_into_model # - deepspeed zero 3 support # - need to copy metadata if any - see _load_state_dict_into_model @@ -186,13 +183,22 @@ def patch_transformers_for_lazyload() -> None: for old_key, new_key in zip(old_keys, new_keys): state_dict[new_key] = state_dict.pop(old_key) - for param_name, param in state_dict.items(): +# BEGIN PATCH + for param_name, param in sorted( + state_dict.items(), + # State dict must be ordered in this manner to make the caching in + # lazy_loader.py effective + key=lambda x: ( + # NOTE: Assuming key is just decimal + int(x[1].key), + x[1].seek_offset, + ), + ): - # BEGIN PATCH if isinstance(param, LazyTensor): print(".", end="", flush=True) param = param.materialize() - # END PATCH +# END PATCH # First part of the test is always true as load_state_dict_keys always contains state_dict keys. if ( From ed0728188a2699c7776df00cdf488b4cabbeaf4b Mon Sep 17 00:00:00 2001 From: somebody Date: Sun, 28 May 2023 13:22:32 -0500 Subject: [PATCH 04/37] More cleaning --- .../generic_hf_torch/class.py | 1 + modeling/lazy_loader.py | 56 +++++++++++++++---- modeling/patches.py | 2 +- 3 files changed, 48 insertions(+), 11 deletions(-) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index 539d2018..d4c254b8 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -238,6 +238,7 @@ class model_backend(HFTorchInferenceModel): shutil.rmtree("cache/") self.patch_embedding() + lazy_loader.post_load_cleanup() self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index 8591bc96..54cbb912 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -87,13 +87,29 @@ torch_checkpoint_file_handles = {} class CheckpointChunkCache: """Storage for common checkpoint weight files to speed up loading. In order for this to be effective at all, weights must be loaded in ascending order - of (key, seek_offset).""" + of (key, seek_offset). + """ + + # There is considerable room for improvement here; we could peek into the + # state dict and preload the N most frequent weight files or something, but + # this first implementation is on par with the speed of whatever the + # previous callback did. + file_name = None key = None handle = None + hit_data = {"hits": 0, "misses": 0} + @classmethod - def clear(cls) -> None: + def clear(cls, unload_model: bool = False) -> None: + if unload_model: + cls.hit_data["hits"] = 0 + cls.hit_data["misses"] = 0 + + if cls.handle: + cls.handle.close() + cls.file_name = None cls.key = None cls.handle = None @@ -140,20 +156,22 @@ class TorchLazyTensor(LazyTensor): ) -> torch.Tensor: checkpoint = torch_checkpoint_file_handles[self.file_name] - filename = os.path.basename(os.path.normpath(self.file_name)).split(".")[0] - # Most of the operations are just seeks, let's see if we can optimize that. + # Often we are using the same weight file to store multiple tensors, so + # let's cache the file handle to maintain a seek position and other + # fast stuff. if ( CheckpointChunkCache.file_name != filename or CheckpointChunkCache.key != self.key or not CheckpointChunkCache.handle ): - # Flush cache if invalid + # Cache miss. Assuming weights are loaded in order of + # (key, seek_offset), this means we need to invalidate the cache. 
print("!", end="", flush=True) + CheckpointChunkCache.hit_data["misses"] += 1 - if CheckpointChunkCache.handle: - CheckpointChunkCache.handle.close() + CheckpointChunkCache.clear() CheckpointChunkCache.file_name = filename CheckpointChunkCache.key = self.key @@ -165,6 +183,10 @@ class TorchLazyTensor(LazyTensor): CheckpointChunkCache.handle = checkpoint.open( f"{filename}/data/{self.key}", "r" ) + else: + # Cache hit. Hip hip hooray! :^) + print(".", end="", flush=True) + CheckpointChunkCache.hit_data["hits"] += 1 size = reduce(lambda x, y: x * y, self.shape, 1) dtype = self.dtype @@ -173,7 +195,9 @@ class TorchLazyTensor(LazyTensor): if dtype is torch.bool else size * ( - (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits + (torch.finfo if self.dtype.is_floating_point else torch.iinfo)( + self.dtype + ).bits >> 3 ) ) @@ -181,14 +205,14 @@ class TorchLazyTensor(LazyTensor): assert isinstance(checkpoint, zipfile.ZipFile) CheckpointChunkCache.handle.seek(self.seek_offset, os.SEEK_SET) - storage = STORAGE_TYPE_MAP[dtype].from_buffer( + storage = STORAGE_TYPE_MAP[self.dtype].from_buffer( CheckpointChunkCache.handle.read(nbytes), "little" ) storage = torch.serialization._get_restore_location(map_location)( storage, self.location ) - tensor = torch.tensor([], dtype=storage.dtype, device=storage.device) + tensor = torch.tensor([], dtype=self.dtype, device=storage.device) tensor.set_(storage, 0, self.shape, self.stride) tensor.requires_grad = not no_grad and self.requires_grad tensor._backward_hooks = self.backward_hooks @@ -449,3 +473,15 @@ def use_lazy_load( torch.load = old_torch_load if dematerialized_modules: init_empty_weights.__exit__(None, None, None) + + +def post_load_cleanup() -> None: + global torch_checkpoint_file_handles + + print("CheckpointChunkCache Hit Data:", CheckpointChunkCache.hit_data) + CheckpointChunkCache.clear() + + for v in torch_checkpoint_file_handles.values(): + v.close() + + torch_checkpoint_file_handles = {} diff --git a/modeling/patches.py b/modeling/patches.py index 52fe9e10..5c0573f7 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -196,7 +196,7 @@ def patch_transformers_for_lazyload() -> None: ): if isinstance(param, LazyTensor): - print(".", end="", flush=True) + # Should always be true param = param.materialize() # END PATCH From ceaefa9f5ed932a230be950fe88dd8efa2472b88 Mon Sep 17 00:00:00 2001 From: somebody Date: Sun, 28 May 2023 14:57:45 -0500 Subject: [PATCH 05/37] Not quite --- .../generic_hf_torch/class.py | 2 +- modeling/inference_models/hf_torch.py | 3 +- modeling/lazy_loader.py | 35 +++++++++---------- modeling/patches.py | 7 ++-- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index d4c254b8..b45b002c 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -238,7 +238,7 @@ class model_backend(HFTorchInferenceModel): shutil.rmtree("cache/") self.patch_embedding() - lazy_loader.post_load_cleanup() + self.model.tie_weights() self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index abc3c54c..cc3b83c1 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -289,7 +289,8 @@ class HFTorchInferenceModel(HFInferenceModel): device_map = infer_auto_device_map( model, 
max_memory={0: "10GiB", 1: "7GiB", "cpu": "15GiB"}, - no_split_module_classes=["GPTJBlock"], + no_split_module_classes=["GPTJBlock", "OPTDecoderLayer"], + dtype="float16", ) return AutoModelForCausalLM.from_pretrained( diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index 54cbb912..b6ea1623 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -46,6 +46,7 @@ POSSIBILITY OF SUCH DAMAGE. import contextlib from functools import reduce +import time import zipfile import pickle import torch @@ -55,6 +56,8 @@ import _codecs import os from typing import Any, Callable, Dict, Optional, Tuple, Type +from torch.storage import UntypedStorage + # Safetensors is a dependency for the local version, TPU/Colab doesn't # support it yet. try: @@ -65,21 +68,9 @@ except ModuleNotFoundError: HAS_SAFETENSORS = False import utils +from logger import logger -STORAGE_TYPE_MAP = { - torch.float64: torch.DoubleStorage, - torch.float32: torch.FloatStorage, - torch.float16: torch.HalfStorage, - torch.int64: torch.LongStorage, - torch.int32: torch.IntStorage, - torch.int16: torch.ShortStorage, - torch.int8: torch.CharStorage, - torch.uint8: torch.ByteStorage, - torch.bool: torch.BoolStorage, - torch.bfloat16: torch.BFloat16Storage, -} - # Storage of zipfile handles for each shard torch_checkpoint_file_handles = {} @@ -205,8 +196,8 @@ class TorchLazyTensor(LazyTensor): assert isinstance(checkpoint, zipfile.ZipFile) CheckpointChunkCache.handle.seek(self.seek_offset, os.SEEK_SET) - storage = STORAGE_TYPE_MAP[self.dtype].from_buffer( - CheckpointChunkCache.handle.read(nbytes), "little" + storage = UntypedStorage.from_buffer( + CheckpointChunkCache.handle.read(nbytes), "little", dtype=self.dtype ) storage = torch.serialization._get_restore_location(map_location)( @@ -421,6 +412,8 @@ def use_lazy_load( yield False return + begin_time = time.time() + try: old_rebuild_tensor = torch._utils._rebuild_tensor torch._utils._rebuild_tensor = _rebuild_tensor @@ -471,17 +464,23 @@ def use_lazy_load( finally: torch._utils._rebuild_tensor = old_rebuild_tensor torch.load = old_torch_load + + post_load_cleanup() + logger.debug( + f"[lazy_load] Context closed in {round(time.time() - begin_time, 2)} seconds." + ) + if dematerialized_modules: init_empty_weights.__exit__(None, None, None) def post_load_cleanup() -> None: + """Close dangling file pointers and clear caches after the load is complete.""" global torch_checkpoint_file_handles - print("CheckpointChunkCache Hit Data:", CheckpointChunkCache.hit_data) - CheckpointChunkCache.clear() + logger.debug(f"[lazy_load] CheckpointChunkCache Hit Data: {CheckpointChunkCache.hit_data}") + CheckpointChunkCache.clear(unload_model=True) for v in torch_checkpoint_file_handles.values(): v.close() - torch_checkpoint_file_handles = {} diff --git a/modeling/patches.py b/modeling/patches.py index 5c0573f7..8cc436b5 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -184,6 +184,10 @@ def patch_transformers_for_lazyload() -> None: state_dict[new_key] = state_dict.pop(old_key) # BEGIN PATCH + # TODO: Based on config + dtype = torch.float16 + set_module_kwargs = {"dtype": dtype} + for param_name, param in sorted( state_dict.items(), # State dict must be ordered in this manner to make the caching in @@ -211,7 +215,6 @@ def patch_transformers_for_lazyload() -> None: param_name = param_name[len(start_prefix) :] module_name = param_name - set_module_kwargs = {} # We convert floating dtypes to the `dtype` passed. 
We want to keep the buffers/params # in int/uint/bool and not cast them. @@ -272,7 +275,7 @@ def patch_transformers_for_lazyload() -> None: elif not load_in_8bit: # For backward compatibility with older versions of `accelerate` set_module_tensor_to_device( - model, param_name, param_device, **set_module_kwargs + model, tensor_name=param_name, device=param_device, **set_module_kwargs ) else: if ( From 58ffad237bc5bcd28631889f9f4a7bf9186669b5 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 29 May 2023 13:34:11 -0500 Subject: [PATCH 06/37] OPT hack --- aiserver.py | 6 +- modeling/inference_models/hf_torch.py | 18 ++++- modeling/patches.py | 109 ++++++++++++++++++++++---- 3 files changed, 113 insertions(+), 20 deletions(-) diff --git a/aiserver.py b/aiserver.py index ad1efdab..390d1979 100644 --- a/aiserver.py +++ b/aiserver.py @@ -27,12 +27,12 @@ from ansi2html import Ansi2HTMLConverter logging.getLogger("urllib3").setLevel(logging.ERROR) -import attention_bias -attention_bias.do_patches() - from modeling import patches patches.patch_transformers_for_lazyload() +import attention_bias +attention_bias.do_patches() + from os import path, getcwd import time import re diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index cc3b83c1..b00132be 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -9,6 +9,7 @@ import functools import itertools import traceback import contextlib +from accelerate.big_modeling import load_checkpoint_and_dispatch from accelerate.utils.modeling import infer_auto_device_map, load_checkpoint_in_model from tqdm.auto import tqdm from typing import Dict, List, Optional, Union @@ -263,6 +264,9 @@ class HFTorchInferenceModel(HFInferenceModel): tf_kwargs["revision"] = utils.koboldai_vars.revision tf_kwargs["cache_dir"] = "cache" + if self.lazy_load: + tf_kwargs.pop("low_cpu_mem_usage", None) + # If we have model hints for legacy model, use them rather than fall back. try: if self.model_name == "GPT2Custom": @@ -285,17 +289,25 @@ class HFTorchInferenceModel(HFInferenceModel): # offload_state_dict=True # ) # model.tie_weights() + no_split_module_classes = ["GPTJBlock", "OPTDecoderLayer"] + print("[HUGE SKELETON] MAKING DEVICE MAP") device_map = infer_auto_device_map( model, max_memory={0: "10GiB", 1: "7GiB", "cpu": "15GiB"}, - no_split_module_classes=["GPTJBlock", "OPTDecoderLayer"], + no_split_module_classes=no_split_module_classes, dtype="float16", ) + print("[HUGE SKELETON] TYING WEIGHTS") + model.tie_weights() + + print("[HUGE SKELETON] LOADING FROM PRETRAINED") return AutoModelForCausalLM.from_pretrained( - location, device_map=device_map - ) # , **tf_kwargs) + location, + device_map=device_map, + **tf_kwargs, + ) except Exception as e: traceback_string = traceback.format_exc().lower() diff --git a/modeling/patches.py b/modeling/patches.py index 8cc436b5..79aefe9d 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -13,6 +13,7 @@ from transformers import ( from modeling.lazy_loader import LazyTensor import utils +from logger import logger def patch_transformers_download(): @@ -127,8 +128,27 @@ def patch_transformers_generation() -> None: def patch_transformers_for_lazyload() -> None: + """ + Most of the code is modified code from the Accelerate and Transformers + projects, made by HuggingFace. The license for these projects are as follows: + --- + Copyright The HuggingFace Team. All rights reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ import torch - import inspect + import accelerate + from accelerate.utils.modeling import named_module_tensors from accelerate.utils import set_module_tensor_to_device, offload_weight def _load_state_dict_into_meta_model( @@ -183,10 +203,9 @@ def patch_transformers_for_lazyload() -> None: for old_key, new_key in zip(old_keys, new_keys): state_dict[new_key] = state_dict.pop(old_key) -# BEGIN PATCH + # BEGIN PATCH # TODO: Based on config dtype = torch.float16 - set_module_kwargs = {"dtype": dtype} for param_name, param in sorted( state_dict.items(), @@ -202,7 +221,7 @@ def patch_transformers_for_lazyload() -> None: if isinstance(param, LazyTensor): # Should always be true param = param.materialize() -# END PATCH + # END PATCH # First part of the test is always true as load_state_dict_keys always contains state_dict keys. if ( @@ -228,13 +247,6 @@ def patch_transformers_for_lazyload() -> None: and dtype == torch.float16 ): param = param.to(torch.float32) - - # For backward compatibility with older versions of `accelerate` - # TODO: @sgugger replace this check with version check at the next `accelerate` release - if "dtype" in list( - inspect.signature(set_module_tensor_to_device).parameters - ): - set_module_kwargs["dtype"] = torch.float32 else: param = param.to(dtype) @@ -250,8 +262,6 @@ def patch_transformers_for_lazyload() -> None: if old_param is not None: param = param.to(old_param.dtype) - set_module_kwargs["value"] = param - if device_map is None: param_device = "cpu" else: @@ -263,6 +273,7 @@ def patch_transformers_for_lazyload() -> None: # TODO: group all errors and raise at the end. raise ValueError(f"{param_name} doesn't have any device set.") param_device = device_map[module_name] + if param_device == "disk": if not is_safetensors: offload_index = offload_weight( @@ -275,7 +286,10 @@ def patch_transformers_for_lazyload() -> None: elif not load_in_8bit: # For backward compatibility with older versions of `accelerate` set_module_tensor_to_device( - model, tensor_name=param_name, device=param_device, **set_module_kwargs + model, + tensor_name=param_name, + device=param_device, + value=param, ) else: if ( @@ -301,6 +315,73 @@ def patch_transformers_for_lazyload() -> None: _load_state_dict_into_meta_model ) + # Patch AlignDevicesHook to hack around OPT lm_head + HACK_ZERO_ON_FAIL_TENSORS = ["lm_head.weight"] + + def _recursed_key_value(module, key): + """Gets a tensor from a recursive key (ie with .s)""" + if "." 
in key: + splits = key.split(".") + for split in splits[:-1]: + new_module = getattr(module, split) + if new_module is None: + raise ValueError(f"{module} has no attribute {split}.") + module = new_module + key = splits[-1] + return getattr(module, key) + + def _init_hook(self, module): + if not self.offload and self.execution_device is not None: + # BEGIN PATCH + for name, tensor in named_module_tensors( + module, recurse=self.place_submodules + ): + try: + set_module_tensor_to_device(module, name, self.execution_device) + except ValueError: + # ValueError: weight is on the meta device, we need a `value` to put in on 0. + # bleuuuuuuuuuuuuuuuhhh + if name in HACK_ZERO_ON_FAIL_TENSORS: + logger.warning(f"Couldn't find value for weight {name}, zeroing.") + set_module_tensor_to_device( + module, + name, + self.execution_device, + value=torch.zeros(tensor.shape), + ) + # END PATCH + elif self.offload: + self.original_devices = { + name: param.device + for name, param in named_module_tensors( + module, recurse=self.place_submodules + ) + } + + if self.weights_map is None: + self.weights_map = { + name: param.to("cpu") + for name, param in named_module_tensors( + module, + include_buffers=self.offload_buffers, + recurse=self.place_submodules, + ) + } + + for name, _ in named_module_tensors( + module, + include_buffers=self.offload_buffers, + recurse=self.place_submodules, + ): + set_module_tensor_to_device(module, name, "meta") + + if not self.offload_buffers and self.execution_device is not None: + for name, _ in module.named_buffers(recurse=self.place_submodules): + set_module_tensor_to_device(module, name, self.execution_device) + return module + + accelerate.hooks.AlignDevicesHook.init_hook = _init_hook + def patch_transformers() -> None: patch_transformers_download() From 146806eba1afa0a1c1578a3ef3e06f242ebfee96 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 29 May 2023 13:36:15 -0500 Subject: [PATCH 07/37] Remove old function --- modeling/patches.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/modeling/patches.py b/modeling/patches.py index 79aefe9d..2b823b3b 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -318,18 +318,6 @@ def patch_transformers_for_lazyload() -> None: # Patch AlignDevicesHook to hack around OPT lm_head HACK_ZERO_ON_FAIL_TENSORS = ["lm_head.weight"] - def _recursed_key_value(module, key): - """Gets a tensor from a recursive key (ie with .s)""" - if "." 
in key: - splits = key.split(".") - for split in splits[:-1]: - new_module = getattr(module, split) - if new_module is None: - raise ValueError(f"{module} has no attribute {split}.") - module = new_module - key = splits[-1] - return getattr(module, key) - def _init_hook(self, module): if not self.offload and self.execution_device is not None: # BEGIN PATCH From ac4384ef75be8f1ac1eb3ebcfecccf898ccd0c7b Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 31 May 2023 10:55:46 -0500 Subject: [PATCH 08/37] Auto _no_split_modules --- modeling/inference_models/hf_torch.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index b00132be..24c0dbb9 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -280,26 +280,16 @@ class HFTorchInferenceModel(HFInferenceModel): try: model = AutoModelForCausalLM.from_config(self.model_config) - # load_checkpoint_in_model( - # model.model, - # location, - # device_map=device_map - # offload_folder="accelerate-disk-cache", - # dtype="float16", - # offload_state_dict=True - # ) - # model.tie_weights() - no_split_module_classes = ["GPTJBlock", "OPTDecoderLayer"] - print("[HUGE SKELETON] MAKING DEVICE MAP") device_map = infer_auto_device_map( model, max_memory={0: "10GiB", 1: "7GiB", "cpu": "15GiB"}, - no_split_module_classes=no_split_module_classes, + no_split_module_classes=model._no_split_modules, dtype="float16", ) - print("[HUGE SKELETON] TYING WEIGHTS") + # TODO: ?? + # print("[HUGE SKELETON] TYING WEIGHTS") model.tie_weights() print("[HUGE SKELETON] LOADING FROM PRETRAINED") From d0d215bb37f4eb81e0355b78315d505e52743c25 Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 31 May 2023 10:59:03 -0500 Subject: [PATCH 09/37] I have no idea why this was removed i swear it was there --- modeling/patches.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modeling/patches.py b/modeling/patches.py index 2b823b3b..203d98c9 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -290,6 +290,7 @@ def patch_transformers_for_lazyload() -> None: tensor_name=param_name, device=param_device, value=param, + dtype=dtype, ) else: if ( From 24b0b32829923614835b2411097f56afd895417b Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 31 May 2023 14:31:08 -0500 Subject: [PATCH 10/37] Maybe works now...? 
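
For reference, a minimal sketch (not part of the diff below) of the load path this series is converging on at this point, assuming the patched patches.patch_transformers_for_lazyload() and lazy_loader.use_lazy_load() from the earlier commits in this series; the checkpoint name is illustrative, and the max_memory numbers are just the hard-coded WIP values from this patch:

    import torch
    from transformers import AutoModelForCausalLM

    from modeling import lazy_loader, patches

    # Route state-dict loading through the patched
    # _load_state_dict_into_meta_model so LazyTensors are materialized
    # directly onto the devices accelerate's device_map picks.
    patches.patch_transformers_for_lazyload()

    with lazy_loader.use_lazy_load(enable=True, dematerialized_modules=False):
        model = AutoModelForCausalLM.from_pretrained(
            "facebook/opt-2.7b",  # illustrative checkpoint, not from this patch
            device_map="auto",    # accelerate decides GPU/CPU/disk placement
            max_memory={0: "10GiB", 1: "7GiB", "cpu": "20GiB"},
            offload_folder="accelerate-disk-cache",
            torch_dtype=torch.float16,
        )

Inside the context, torch.load hands back LazyTensors bound to cached zipfile handles, and the patched loader materializes them in (key, seek_offset) order so the CheckpointChunkCache mostly hits.
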
--- .../generic_hf_torch/class.py | 246 +++++------ modeling/inference_models/hf_torch.py | 414 ++---------------- modeling/patches.py | 126 +++--- 3 files changed, 239 insertions(+), 547 deletions(-) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index b45b002c..f879cb37 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -71,6 +71,9 @@ class model_backend(HFTorchInferenceModel): ) if self.lazy_load: + # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time + tf_kwargs.pop("low_cpu_mem_usage", None) + # If we're using lazy loader, we need to figure out what the model's hidden layers are called with lazy_loader.use_lazy_load(dematerialized_modules=True): try: @@ -83,145 +86,92 @@ class model_backend(HFTorchInferenceModel): self.lazy_load = False # Download model from Huggingface if it does not exist, otherwise load locally - with self._maybe_use_float16(), lazy_loader.use_lazy_load( - enable=self.lazy_load, - callback=self._get_lazy_load_callback(utils.num_layers(self.model_config)) - if self.lazy_load - else None, - dematerialized_modules=True, - ): - if self.lazy_load: - # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - tf_kwargs.pop("low_cpu_mem_usage", None) + if self.get_local_model_path(): + # Model is stored locally, load it. + self.model = self._get_model(self.get_local_model_path(), tf_kwargs) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) + else: + # Model not stored locally, we need to download it. - if self.get_local_model_path(): - # Model is stored locally, load it. - self.model = self._get_model(self.get_local_model_path(), tf_kwargs) - self.tokenizer = self._get_tokenizer(self.get_local_model_path()) - else: - # Model not stored locally, we need to download it. 
+ # _rebuild_tensor patch for casting dtype and supporting LazyTensors + old_rebuild_tensor = torch._utils._rebuild_tensor - # _rebuild_tensor patch for casting dtype and supporting LazyTensors - old_rebuild_tensor = torch._utils._rebuild_tensor + def new_rebuild_tensor( + storage: Union[lazy_loader.LazyTensor, torch.Storage], + storage_offset, + shape, + stride, + ): + if not isinstance(storage, lazy_loader.LazyTensor): + dtype = storage.dtype + else: + dtype = storage.storage_type.dtype + if not isinstance(dtype, torch.dtype): + dtype = storage.storage_type(0).dtype + if dtype is torch.float32 and len(shape) >= 2: + utils.koboldai_vars.fp32_model = True + return old_rebuild_tensor(storage, storage_offset, shape, stride) - def new_rebuild_tensor( - storage: Union[lazy_loader.LazyTensor, torch.Storage], - storage_offset, - shape, - stride, - ): - if not isinstance(storage, lazy_loader.LazyTensor): - dtype = storage.dtype - else: - dtype = storage.storage_type.dtype - if not isinstance(dtype, torch.dtype): - dtype = storage.storage_type(0).dtype - if dtype is torch.float32 and len(shape) >= 2: - utils.koboldai_vars.fp32_model = True - return old_rebuild_tensor(storage, storage_offset, shape, stride) + torch._utils._rebuild_tensor = new_rebuild_tensor + self.model = self._get_model(self.model_name, tf_kwargs) + self.tokenizer = self._get_tokenizer(self.model_name) + torch._utils._rebuild_tensor = old_rebuild_tensor - torch._utils._rebuild_tensor = new_rebuild_tensor - self.model = self._get_model(self.model_name, tf_kwargs) - self.tokenizer = self._get_tokenizer(self.model_name) - torch._utils._rebuild_tensor = old_rebuild_tensor + if save_model: + self.tokenizer.save_pretrained( + self.get_local_model_path(ignore_existance=True) + ) - if save_model: - self.tokenizer.save_pretrained( - self.get_local_model_path(ignore_existance=True) + if utils.koboldai_vars.fp32_model: + # Use save_pretrained to convert fp32 models to fp16, + # unless we are using disk cache because save_pretrained + # is not supported in that case + self.model = self.model.half() + self.model.save_pretrained( + self.get_local_model_path(ignore_existance=True), + max_shard_size="500MiB", ) - if utils.koboldai_vars.fp32_model: - # Use save_pretrained to convert fp32 models to fp16, - # unless we are using disk cache because save_pretrained - # is not supported in that case - self.model = self.model.half() - self.model.save_pretrained( - self.get_local_model_path(ignore_existance=True), - max_shard_size="500MiB", - ) + else: + # For fp16 models, we can just copy the model files directly + import transformers.configuration_utils + import transformers.modeling_utils + import transformers.file_utils + import huggingface_hub - else: - # For fp16 models, we can just copy the model files directly - import transformers.configuration_utils - import transformers.modeling_utils - import transformers.file_utils - import huggingface_hub - - # Save the config.json - shutil.move( - os.path.realpath( - huggingface_hub.hf_hub_download( - self.model_name, - transformers.configuration_utils.CONFIG_NAME, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=False, - ) - ), - os.path.join( - self.get_local_model_path(ignore_existance=True), + # Save the config.json + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + self.model_name, transformers.configuration_utils.CONFIG_NAME, - ), - ) - - if utils.num_shards is None: - # Save the pytorch_model.bin or model.safetensors 
of an unsharded model - any_success = False - possible_checkpoint_names = [ - transformers.modeling_utils.WEIGHTS_NAME, - "model.safetensors", - ] - - for possible_checkpoint_name in possible_checkpoint_names: - try: - shutil.move( - os.path.realpath( - huggingface_hub.hf_hub_download( - self.model_name, - possible_checkpoint_name, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=False, - ) - ), - os.path.join( - self.get_local_model_path( - ignore_existance=True - ), - possible_checkpoint_name, - ), - ) - any_success = True - except Exception: - pass - - if not any_success: - raise RuntimeError( - f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revisison}'" - ) - else: - # Handle saving sharded models - - with open(utils.from_pretrained_index_filename) as f: - map_data = json.load(f) - filenames = set(map_data["weight_map"].values()) - # Save the pytorch_model.bin.index.json of a sharded model - shutil.move( - os.path.realpath(utils.from_pretrained_index_filename), - os.path.join( - self.get_local_model_path(ignore_existance=True), - transformers.modeling_utils.WEIGHTS_INDEX_NAME, - ), + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, ) - # Then save the pytorch_model-#####-of-#####.bin files - for filename in filenames: + ), + os.path.join( + self.get_local_model_path(ignore_existance=True), + transformers.configuration_utils.CONFIG_NAME, + ), + ) + + if utils.num_shards is None: + # Save the pytorch_model.bin or model.safetensors of an unsharded model + any_success = False + possible_checkpoint_names = [ + transformers.modeling_utils.WEIGHTS_NAME, + "model.safetensors", + ] + + for possible_checkpoint_name in possible_checkpoint_names: + try: shutil.move( os.path.realpath( huggingface_hub.hf_hub_download( self.model_name, - filename, + possible_checkpoint_name, revision=utils.koboldai_vars.revision, cache_dir="cache", local_files_only=True, @@ -232,13 +182,53 @@ class model_backend(HFTorchInferenceModel): self.get_local_model_path( ignore_existance=True ), - filename, + possible_checkpoint_name, ), ) - shutil.rmtree("cache/") + any_success = True + except Exception: + pass + + if not any_success: + raise RuntimeError( + f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revisison}'" + ) + else: + # Handle saving sharded models + + with open(utils.from_pretrained_index_filename) as f: + map_data = json.load(f) + filenames = set(map_data["weight_map"].values()) + # Save the pytorch_model.bin.index.json of a sharded model + shutil.move( + os.path.realpath(utils.from_pretrained_index_filename), + os.path.join( + self.get_local_model_path(ignore_existance=True), + transformers.modeling_utils.WEIGHTS_INDEX_NAME, + ), + ) + # Then save the pytorch_model-#####-of-#####.bin files + for filename in filenames: + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + self.model_name, + filename, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + os.path.join( + self.get_local_model_path(ignore_existance=True), + filename, + ), + ) + shutil.rmtree("cache/") self.patch_embedding() - self.model.tie_weights() + # self.model.tie_weights() self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() diff --git a/modeling/inference_models/hf_torch.py 
b/modeling/inference_models/hf_torch.py index 24c0dbb9..151857cf 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -278,26 +278,57 @@ class HFTorchInferenceModel(HFInferenceModel): # Try to determine model type from either AutoModel or falling back to legacy try: - model = AutoModelForCausalLM.from_config(self.model_config) + # with accelerate.init_empty_weights(): + # model = AutoModelForCausalLM.from_config(self.model_config) - print("[HUGE SKELETON] MAKING DEVICE MAP") - device_map = infer_auto_device_map( - model, - max_memory={0: "10GiB", 1: "7GiB", "cpu": "15GiB"}, - no_split_module_classes=model._no_split_modules, - dtype="float16", - ) + # print("[HUGE SKELETON] MAKING DEVICE MAP") + # device_map = infer_auto_device_map( + # model, + # no_split_module_classes=model._no_split_modules, + # max_memory={0: "10GiB", 1: "7GiB", "cpu": "20GiB"}, + # dtype=torch.float16, + # ) - # TODO: ?? + # # TODO: ?? # print("[HUGE SKELETON] TYING WEIGHTS") - model.tie_weights() + # model.tie_weights() print("[HUGE SKELETON] LOADING FROM PRETRAINED") - return AutoModelForCausalLM.from_pretrained( - location, - device_map=device_map, - **tf_kwargs, - ) + # model = load_checkpoint_and_dispatch( + # model, + # location + "/pytorch_model.bin", + # device_map=device_map, + # no_split_module_classes=model._no_split_modules, + # dtype=torch.float16, + # ) + with lazy_loader.use_lazy_load( + enable=True, + # dematerialized_modules=True, + dematerialized_modules=False, + ): + model = AutoModelForCausalLM.from_pretrained( + location, + device_map="auto", + max_memory={0: "10GiB", 1: "7GiB", "cpu": "20GiB"}, + offload_folder="accelerate-disk-cache", + torch_dtype=torch.float16, + **tf_kwargs, + ) + + for name, value in list(model.named_parameters()) + list( + model.named_buffers() + ): + if value.device != torch.device("meta"): + continue + print(name, value, value.nelement()) + # try: + # value.cpu() + # except NotImplementedError: + # # Can't be copied out of meta tensor, no data + # print("Bad news at", name) + # # setattr(model, name, torch.zeros(value.size())) + + return model except Exception as e: traceback_string = traceback.format_exc().lower() @@ -377,359 +408,6 @@ class HFTorchInferenceModel(HFInferenceModel): Embedding.__call__ = new_embedding_call Embedding._koboldai_patch_causallm_model = self.model - def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True): - if not self.lazy_load: - return - - # disk_blocks = breakmodel.disk_blocks - # gpu_blocks = breakmodel.gpu_blocks - # ram_blocks = ram_blocks = n_layers - sum(gpu_blocks) - # cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - - def lazy_load_callback( - model_dict: Dict[str, Union[lazy_loader.LazyTensor, torch.Tensor]], - f, - is_safetensors: bool = False, - **_, - ): - if lazy_load_callback.nested: - return - lazy_load_callback.nested = True - return - - device_map: Dict[str, Union[str, int]] = {} - - @functools.lru_cache(maxsize=None) - def get_original_key(key) -> Optional[str]: - key_candidates = [ - original_key - for original_key in utils.module_names - if original_key.endswith(key) - ] - - if not key_candidates: - logger.debug(f"!!! 
No key candidates for {key}") - return None - - return max(key_candidates, key=len) - - for key, value in model_dict.items(): - original_key = get_original_key(key) - - if not original_key: - continue - - if isinstance(value, lazy_loader.LazyTensor) and not any( - original_key.startswith(n) for n in utils.layers_module_names - ): - device_map[key] = ( - utils.koboldai_vars.gpu_device - if utils.koboldai_vars.hascuda and self.usegpu - else "cpu" - if not utils.koboldai_vars.hascuda or not self.breakmodel - else breakmodel.primary_device - ) - else: - layer = int( - max( - ( - n - for n in utils.layers_module_names - if original_key.startswith(n) - ), - key=len, - ).rsplit(".", 1)[1] - ) - device = ( - utils.koboldai_vars.gpu_device - if utils.koboldai_vars.hascuda and self.usegpu - else "disk" - if layer < disk_blocks and layer < ram_blocks - else "cpu" - if not utils.koboldai_vars.hascuda or not self.breakmodel - else "shared" - if layer < ram_blocks - else bisect.bisect_right( - cumulative_gpu_blocks, layer - ram_blocks - ) - ) - device_map[key] = device - - if utils.num_shards is None or utils.current_shard == 0: - utils.offload_index = {} - if os.path.isdir("accelerate-disk-cache"): - # Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder - # (the folder doesn't contain any subfolders so os.remove will do just fine) - for filename in os.listdir("accelerate-disk-cache"): - try: - os.remove(os.path.join("accelerate-disk-cache", filename)) - except OSError: - pass - os.makedirs("accelerate-disk-cache", exist_ok=True) - if utils.num_shards is not None: - num_tensors = len( - utils.get_sharded_checkpoint_num_tensors( - utils.from_pretrained_model_name, - utils.from_pretrained_index_filename, - is_safetensors=is_safetensors, - **utils.from_pretrained_kwargs, - ) - ) - else: - num_tensors = len(device_map) - print(flush=True) - utils.koboldai_vars.status_message = "Loading model" - utils.koboldai_vars.total_layers = num_tensors - utils.koboldai_vars.loaded_layers = 0 - utils.bar = tqdm( - total=num_tensors, - desc="Loading model tensors", - file=utils.UIProgressBarFile(), - position=1, - ) - - if not is_safetensors: - # Torch lazyload - with zipfile.ZipFile(f, "r") as z: - try: - last_storage_key = None - zipfolder = os.path.basename(os.path.normpath(f)).split(".")[0] - f = None - current_offset = 0 - able_to_pin_layers = True - if utils.num_shards is not None: - utils.current_shard += 1 - for key in sorted( - device_map.keys(), - key=lambda k: ( - model_dict[k].key, - model_dict[k].seek_offset, - ), - ): - storage_key = model_dict[key].key - if ( - storage_key != last_storage_key - or model_dict[key].seek_offset < current_offset - ): - last_storage_key = storage_key - if isinstance(f, zipfile.ZipExtFile): - f.close() - ziproot = z.namelist()[0].split("/")[0] - f = z.open(f"{ziproot}/data/{storage_key}") - - current_offset = 0 - if current_offset != model_dict[key].seek_offset: - f.read(model_dict[key].seek_offset - current_offset) - current_offset = model_dict[key].seek_offset - device = device_map[key] - size = functools.reduce( - lambda x, y: x * y, model_dict[key].shape, 1 - ) - dtype = model_dict[key].dtype - nbytes = ( - size - if dtype is torch.bool - else size - * ( - ( - torch.finfo - if dtype.is_floating_point - else torch.iinfo - )(dtype).bits - >> 3 - ) - ) - # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
", end="", flush=True) - # logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ") - model_dict[key] = model_dict[key].materialize( - f, map_location="cpu" - ) - if model_dict[key].dtype is torch.float32: - utils.koboldai_vars.fp32_model = True - if ( - convert_to_float16 - and breakmodel.primary_device != "cpu" - and utils.koboldai_vars.hascuda - and (self.breakmodel or self.usegpu) - and model_dict[key].dtype is torch.float32 - ): - model_dict[key] = model_dict[key].to(torch.float16) - if breakmodel.primary_device == "cpu" or ( - not self.usegpu - and not self.breakmodel - and model_dict[key].dtype is torch.float16 - ): - model_dict[key] = model_dict[key].to(torch.float32) - if device == "shared": - model_dict[key] = model_dict[key].to("cpu").detach_() - if able_to_pin_layers: - try: - model_dict[key] = model_dict[key].pin_memory() - except: - able_to_pin_layers = False - elif device == "disk": - accelerate.utils.offload_weight( - model_dict[key], - get_original_key(key), - "accelerate-disk-cache", - index=utils.offload_index, - ) - model_dict[key] = model_dict[key].to("meta") - else: - model_dict[key] = model_dict[key].to(device) - # print("OK", flush=True) - current_offset += nbytes - utils.bar.update(1) - utils.koboldai_vars.loaded_layers += 1 - finally: - if ( - utils.num_shards is None - or utils.current_shard >= utils.num_shards - ): - if utils.offload_index: - for name, tensor in utils.named_buffers: - dtype = tensor.dtype - if ( - convert_to_float16 - and breakmodel.primary_device != "cpu" - and utils.koboldai_vars.hascuda - and (self.breakmodel or self.usegpu) - ): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or ( - not self.usegpu and not self.breakmodel - ): - dtype = torch.float32 - if ( - name in model_dict - and model_dict[name].dtype is not dtype - ): - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is not dtype: - tensor = tensor.to(dtype) - if name not in utils.offload_index: - accelerate.utils.offload_weight( - tensor, - name, - "accelerate-disk-cache", - index=utils.offload_index, - ) - accelerate.utils.save_offload_index( - utils.offload_index, "accelerate-disk-cache" - ) - utils.bar.close() - utils.bar = None - utils.koboldai_vars.status_message = "" - lazy_load_callback.nested = False - if isinstance(f, zipfile.ZipExtFile): - f.close() - else: - # Loading with safetensors - try: - able_to_pin_layers = True - - if utils.num_shards is not None: - utils.current_shard += 1 - - for key in sorted( - device_map.keys(), - key=lambda k: model_dict[k].key, - ): - storage_key = model_dict[key].key - - device = device_map[key] - - # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
", end="", flush=True) - - model_dict[key] = model_dict[key].materialize( - f, map_location="cpu" - ) - - if model_dict[key].dtype is torch.float32: - utils.koboldai_vars.fp32_model = True - - if ( - convert_to_float16 - and breakmodel.primary_device != "cpu" - and utils.koboldai_vars.hascuda - and (self.breakmodel or self.usegpu) - and model_dict[key].dtype is torch.float32 - ): - model_dict[key] = model_dict[key].to(torch.float16) - - if breakmodel.primary_device == "cpu" or ( - not self.usegpu - and not self.breakmodel - and model_dict[key].dtype is torch.float16 - ): - model_dict[key] = model_dict[key].to(torch.float32) - - if device == "shared": - model_dict[key] = model_dict[key].to("cpu").detach_() - if able_to_pin_layers: - try: - model_dict[key] = model_dict[key].pin_memory() - except: - able_to_pin_layers = False - elif device == "disk": - accelerate.utils.offload_weight( - model_dict[key], - get_original_key(key), - "accelerate-disk-cache", - index=utils.offload_index, - ) - model_dict[key] = model_dict[key].to("meta") - else: - model_dict[key] = model_dict[key].to(device) - - utils.bar.update(1) - utils.koboldai_vars.loaded_layers += 1 - - finally: - if ( - utils.num_shards is None - or utils.current_shard >= utils.num_shards - ): - if utils.offload_index: - for name, tensor in utils.named_buffers: - dtype = tensor.dtype - if ( - convert_to_float16 - and breakmodel.primary_device != "cpu" - and utils.koboldai_vars.hascuda - and (self.breakmodel or self.usegpu) - ): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or ( - not self.usegpu and not self.breakmodel - ): - dtype = torch.float32 - if ( - name in model_dict - and model_dict[name].dtype is not dtype - ): - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is not dtype: - tensor = tensor.to(dtype) - if name not in utils.offload_index: - accelerate.utils.offload_weight( - tensor, - name, - "accelerate-disk-cache", - index=utils.offload_index, - ) - accelerate.utils.save_offload_index( - utils.offload_index, "accelerate-disk-cache" - ) - utils.bar.close() - utils.bar = None - utils.koboldai_vars.status_message = "" - - lazy_load_callback.nested = False - - lazy_load_callback.nested = False - return lazy_load_callback - @contextlib.contextmanager def _maybe_use_float16(self, always_use: bool = False): if always_use or ( diff --git a/modeling/patches.py b/modeling/patches.py index 203d98c9..7393b6ba 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -1,8 +1,9 @@ from __future__ import annotations import copy +from ctypes import Union import requests -from typing import Iterable, List +from typing import Iterable, List, Optional from tqdm.auto import tqdm import transformers @@ -148,6 +149,27 @@ def patch_transformers_for_lazyload() -> None: """ import torch import accelerate + + # _old_set_module_tensor_to_device = ( + # accelerate.utils.modeling.set_module_tensor_to_device + # ) + + # def _set_module_tensor_to_device( + # module: torch.nn.Module, + # tensor_name: str, + # device: Union[int, str, torch.device], + # value: Optional[torch.Tensor] = None, + # dtype: Optional[Union[str, torch.dtype]] = None, + # ): + # if isinstance(value, LazyTensor): + # value = value.materialize() + # print("HEY!", dtype) + # return _old_set_module_tensor_to_device( + # module, tensor_name, device, value, dtype + # ) + + # accelerate.utils.modeling.set_module_tensor_to_device = _set_module_tensor_to_device + from accelerate.utils.modeling import named_module_tensors from accelerate.utils import 
set_module_tensor_to_device, offload_weight @@ -205,7 +227,7 @@ def patch_transformers_for_lazyload() -> None: # BEGIN PATCH # TODO: Based on config - dtype = torch.float16 + # dtype = torch.float16 for param_name, param in sorted( state_dict.items(), @@ -316,60 +338,62 @@ def patch_transformers_for_lazyload() -> None: _load_state_dict_into_meta_model ) - # Patch AlignDevicesHook to hack around OPT lm_head - HACK_ZERO_ON_FAIL_TENSORS = ["lm_head.weight"] + # # Patch AlignDevicesHook to hack around OPT lm_head + # HACK_ZERO_ON_FAIL_TENSORS = ["lm_head.weight"] - def _init_hook(self, module): - if not self.offload and self.execution_device is not None: - # BEGIN PATCH - for name, tensor in named_module_tensors( - module, recurse=self.place_submodules - ): - try: - set_module_tensor_to_device(module, name, self.execution_device) - except ValueError: - # ValueError: weight is on the meta device, we need a `value` to put in on 0. - # bleuuuuuuuuuuuuuuuhhh - if name in HACK_ZERO_ON_FAIL_TENSORS: - logger.warning(f"Couldn't find value for weight {name}, zeroing.") - set_module_tensor_to_device( - module, - name, - self.execution_device, - value=torch.zeros(tensor.shape), - ) - # END PATCH - elif self.offload: - self.original_devices = { - name: param.device - for name, param in named_module_tensors( - module, recurse=self.place_submodules - ) - } + # def _init_hook(self, module): + # if not self.offload and self.execution_device is not None: + # # BEGIN PATCH + # for name, tensor in named_module_tensors( + # module, recurse=self.place_submodules + # ): + # try: + # set_module_tensor_to_device(module, name, self.execution_device) + # except ValueError: + # # ValueError: weight is on the meta device, we need a `value` to put in on 0. + # # bleuuuuuuuuuuuuuuuhhh + # if name in HACK_ZERO_ON_FAIL_TENSORS: + # logger.warning( + # f"Couldn't find value for weight {name}, zeroing." 
+ # ) + # set_module_tensor_to_device( + # module, + # name, + # self.execution_device, + # value=torch.zeros(tensor.shape), + # ) + # # END PATCH + # elif self.offload: + # self.original_devices = { + # name: param.device + # for name, param in named_module_tensors( + # module, recurse=self.place_submodules + # ) + # } - if self.weights_map is None: - self.weights_map = { - name: param.to("cpu") - for name, param in named_module_tensors( - module, - include_buffers=self.offload_buffers, - recurse=self.place_submodules, - ) - } + # if self.weights_map is None: + # self.weights_map = { + # name: param.to("cpu") + # for name, param in named_module_tensors( + # module, + # include_buffers=self.offload_buffers, + # recurse=self.place_submodules, + # ) + # } - for name, _ in named_module_tensors( - module, - include_buffers=self.offload_buffers, - recurse=self.place_submodules, - ): - set_module_tensor_to_device(module, name, "meta") + # for name, _ in named_module_tensors( + # module, + # include_buffers=self.offload_buffers, + # recurse=self.place_submodules, + # ): + # set_module_tensor_to_device(module, name, "meta") - if not self.offload_buffers and self.execution_device is not None: - for name, _ in module.named_buffers(recurse=self.place_submodules): - set_module_tensor_to_device(module, name, self.execution_device) - return module + # if not self.offload_buffers and self.execution_device is not None: + # for name, _ in module.named_buffers(recurse=self.place_submodules): + # set_module_tensor_to_device(module, name, self.execution_device) + # return module - accelerate.hooks.AlignDevicesHook.init_hook = _init_hook + # accelerate.hooks.AlignDevicesHook.init_hook = _init_hook def patch_transformers() -> None: From f326fc07e8da6d64b63faac82dd91c101b6bd27b Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 31 May 2023 14:42:05 -0500 Subject: [PATCH 11/37] Seems to work --- modeling/inference_models/hf_torch.py | 38 +----------- modeling/patches.py | 87 +-------------------------- 2 files changed, 4 insertions(+), 121 deletions(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 151857cf..1fb78717 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -278,56 +278,22 @@ class HFTorchInferenceModel(HFInferenceModel): # Try to determine model type from either AutoModel or falling back to legacy try: - # with accelerate.init_empty_weights(): - # model = AutoModelForCausalLM.from_config(self.model_config) - - # print("[HUGE SKELETON] MAKING DEVICE MAP") - # device_map = infer_auto_device_map( - # model, - # no_split_module_classes=model._no_split_modules, - # max_memory={0: "10GiB", 1: "7GiB", "cpu": "20GiB"}, - # dtype=torch.float16, - # ) - - # # TODO: ?? - # print("[HUGE SKELETON] TYING WEIGHTS") - # model.tie_weights() - print("[HUGE SKELETON] LOADING FROM PRETRAINED") - # model = load_checkpoint_and_dispatch( - # model, - # location + "/pytorch_model.bin", - # device_map=device_map, - # no_split_module_classes=model._no_split_modules, - # dtype=torch.float16, - # ) with lazy_loader.use_lazy_load( enable=True, + # DO NOT DEMATERIALIZE MODULES / INIT WEIGHTS EMPTY!!! IT WILL EXPLODE!!!!!!! 
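As a standalone approximation of the load pattern these hunks converge on (a stock Hugging Face call underneath the lazy-load wrapper, with accelerate placing modules and spilling overflow to a disk cache), something like the sketch below; the checkpoint id and memory budget are placeholders, not values taken from the patch.

    import torch
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-1.3b",                      # placeholder checkpoint
        device_map="auto",                        # let accelerate place each module
        max_memory={0: "10GiB", "cpu": "20GiB"},  # illustrative per-device budget
        offload_folder="accelerate-disk-cache",   # spill what doesn't fit to disk
        torch_dtype=torch.float16,
    )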
# dematerialized_modules=True, dematerialized_modules=False, ): model = AutoModelForCausalLM.from_pretrained( location, device_map="auto", - max_memory={0: "10GiB", 1: "7GiB", "cpu": "20GiB"}, + # max_memory={0: "10GiB", 1: "7GiB", "cpu": "20GiB"}, offload_folder="accelerate-disk-cache", torch_dtype=torch.float16, **tf_kwargs, ) - for name, value in list(model.named_parameters()) + list( - model.named_buffers() - ): - if value.device != torch.device("meta"): - continue - print(name, value, value.nelement()) - # try: - # value.cpu() - # except NotImplementedError: - # # Can't be copied out of meta tensor, no data - # print("Bad news at", name) - # # setattr(model, name, torch.zeros(value.size())) - return model except Exception as e: traceback_string = traceback.format_exc().lower() diff --git a/modeling/patches.py b/modeling/patches.py index 7393b6ba..bd5a6325 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -148,29 +148,6 @@ def patch_transformers_for_lazyload() -> None: limitations under the License. """ import torch - import accelerate - - # _old_set_module_tensor_to_device = ( - # accelerate.utils.modeling.set_module_tensor_to_device - # ) - - # def _set_module_tensor_to_device( - # module: torch.nn.Module, - # tensor_name: str, - # device: Union[int, str, torch.device], - # value: Optional[torch.Tensor] = None, - # dtype: Optional[Union[str, torch.dtype]] = None, - # ): - # if isinstance(value, LazyTensor): - # value = value.materialize() - # print("HEY!", dtype) - # return _old_set_module_tensor_to_device( - # module, tensor_name, device, value, dtype - # ) - - # accelerate.utils.modeling.set_module_tensor_to_device = _set_module_tensor_to_device - - from accelerate.utils.modeling import named_module_tensors from accelerate.utils import set_module_tensor_to_device, offload_weight def _load_state_dict_into_meta_model( @@ -225,10 +202,7 @@ def patch_transformers_for_lazyload() -> None: for old_key, new_key in zip(old_keys, new_keys): state_dict[new_key] = state_dict.pop(old_key) - # BEGIN PATCH - # TODO: Based on config - # dtype = torch.float16 - +# BEGIN PATCH for param_name, param in sorted( state_dict.items(), # State dict must be ordered in this manner to make the caching in @@ -243,7 +217,7 @@ def patch_transformers_for_lazyload() -> None: if isinstance(param, LazyTensor): # Should always be true param = param.materialize() - # END PATCH +# END PATCH # First part of the test is always true as load_state_dict_keys always contains state_dict keys. if ( @@ -338,63 +312,6 @@ def patch_transformers_for_lazyload() -> None: _load_state_dict_into_meta_model ) - # # Patch AlignDevicesHook to hack around OPT lm_head - # HACK_ZERO_ON_FAIL_TENSORS = ["lm_head.weight"] - - # def _init_hook(self, module): - # if not self.offload and self.execution_device is not None: - # # BEGIN PATCH - # for name, tensor in named_module_tensors( - # module, recurse=self.place_submodules - # ): - # try: - # set_module_tensor_to_device(module, name, self.execution_device) - # except ValueError: - # # ValueError: weight is on the meta device, we need a `value` to put in on 0. - # # bleuuuuuuuuuuuuuuuhhh - # if name in HACK_ZERO_ON_FAIL_TENSORS: - # logger.warning( - # f"Couldn't find value for weight {name}, zeroing." 
- # ) - # set_module_tensor_to_device( - # module, - # name, - # self.execution_device, - # value=torch.zeros(tensor.shape), - # ) - # # END PATCH - # elif self.offload: - # self.original_devices = { - # name: param.device - # for name, param in named_module_tensors( - # module, recurse=self.place_submodules - # ) - # } - - # if self.weights_map is None: - # self.weights_map = { - # name: param.to("cpu") - # for name, param in named_module_tensors( - # module, - # include_buffers=self.offload_buffers, - # recurse=self.place_submodules, - # ) - # } - - # for name, _ in named_module_tensors( - # module, - # include_buffers=self.offload_buffers, - # recurse=self.place_submodules, - # ): - # set_module_tensor_to_device(module, name, "meta") - - # if not self.offload_buffers and self.execution_device is not None: - # for name, _ in module.named_buffers(recurse=self.place_submodules): - # set_module_tensor_to_device(module, name, self.execution_device) - # return module - - # accelerate.hooks.AlignDevicesHook.init_hook = _init_hook - def patch_transformers() -> None: patch_transformers_download() From 0052ad401a9bb18a709897b2f796c464835ee590 Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 21 Jun 2023 13:57:32 -0500 Subject: [PATCH 12/37] Basic breakmodel ui support Seems to work --- .../generic_hf_torch/class.py | 12 +- modeling/inference_models/hf_torch.py | 122 +++++++++++------- 2 files changed, 83 insertions(+), 51 deletions(-) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index f879cb37..c65b3ab6 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -59,8 +59,6 @@ class model_backend(HFTorchInferenceModel): # Also, lazy loader doesn't support GPT-2 models self.lazy_load = False - # If we're using torch_lazy_loader, we need to get breakmodel config - # early so that it knows where to load the individual model tensors logger.debug( "lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format( self.lazy_load, @@ -70,6 +68,16 @@ class model_backend(HFTorchInferenceModel): ) ) + # If we're using torch_lazy_loader, we need to get breakmodel config + # early so that it knows where to load the individual model tensors + if ( + self.lazy_load + and utils.koboldai_vars.hascuda + and utils.koboldai_vars.breakmodel + and not utils.koboldai_vars.nobreakmodel + ): + self.breakmodel_device_config(self.model_config) + if self.lazy_load: # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time tf_kwargs.pop("low_cpu_mem_usage", None) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 1fb78717..c1bcdf0b 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -1,17 +1,13 @@ from __future__ import annotations +from dataclasses import dataclass -import gc import os import time import bisect -import zipfile -import functools import itertools import traceback import contextlib -from accelerate.big_modeling import load_checkpoint_and_dispatch -from accelerate.utils.modeling import infer_auto_device_map, load_checkpoint_in_model -from tqdm.auto import tqdm +from torch import nn from typing import Dict, List, Optional, Union import torch @@ -41,17 +37,36 @@ from modeling.inference_model import ( use_core_manipulations, ) -try: - import accelerate.utils -except ModuleNotFoundError as e: - if not utils.koboldai_vars.use_colab_tpu: - raise e - # When set to true, 
messages will appear in the console if samplers are not # changing the scores. Keep in mind some samplers don't always change the # scores for each token. LOG_SAMPLER_NO_EFFECT = False +class BreakmodelConfig: + def __init__(self) -> None: + self.disk_blocks = 0 + self.gpu_blocks = [] + self.primary_device = 0 if torch.cuda.device_count() > 0 else "cpu" + + def get_device_map(self, model: nn.Module) -> dict: + ram_blocks = len(utils.layers_module_names) - sum(self.gpu_blocks) + cumulative_gpu_blocks = tuple(itertools.accumulate(self.gpu_blocks)) + device_map = {} + + for name in utils.layers_module_names: + layer = int(name.rsplit(".", 1)[1]) + device = ( + ("disk" if layer < self.disk_blocks else "cpu") + if layer < ram_blocks + else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) + ) + device_map[name] = device + + for name in utils.get_missing_module_names(model, list(device_map.keys())): + device_map[name] = self.primary_device + + return device_map + class HFTorchInferenceModel(HFInferenceModel): def __init__(self) -> None: @@ -80,6 +95,16 @@ class HFTorchInferenceModel(HFInferenceModel): post_token_probs=True, ) self._old_stopping_criteria = None + self.breakmodel_config = BreakmodelConfig() + + def set_input_parameters(self, parameters): + ret = super().set_input_parameters(parameters) + + # Hook onto input param setting for setting breakmodel stuff + self.breakmodel_config.gpu_blocks = self.layers + self.breakmodel_config.disk_blocks = self.disk_layers + + return ret def _apply_warpers( self, scores: torch.Tensor, input_ids: torch.Tensor @@ -278,17 +303,20 @@ class HFTorchInferenceModel(HFInferenceModel): # Try to determine model type from either AutoModel or falling back to legacy try: - print("[HUGE SKELETON] LOADING FROM PRETRAINED") + with lazy_loader.use_lazy_load(dematerialized_modules=True): + metamodel = AutoModelForCausalLM.from_config(self.model_config) + device_map = self.breakmodel_config.get_device_map(metamodel) + with lazy_loader.use_lazy_load( enable=True, # DO NOT DEMATERIALIZE MODULES / INIT WEIGHTS EMPTY!!! IT WILL EXPLODE!!!!!!! 
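For reference, the bisect-based placement that get_device_map above produces can be pictured with a small standalone sketch. The 24-layer, two-GPU split below is invented purely for illustration, and the map is keyed by layer index here for brevity where the real code keys by module name.

    import bisect
    import itertools

    gpu_blocks = [8, 8]   # hypothetical: 8 layers on GPU 0, 8 on GPU 1
    disk_blocks = 2       # hypothetical: first 2 non-GPU layers go to the disk cache
    n_layers = 24         # hypothetical total transformer layer count

    ram_blocks = n_layers - sum(gpu_blocks)                          # 8 layers stay off-GPU
    cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))  # (8, 16)

    placement = {}
    for layer in range(n_layers):
        if layer < ram_blocks:
            # Off-GPU layers: the first disk_blocks of them live in the disk cache
            placement[layer] = "disk" if layer < disk_blocks else "cpu"
        else:
            # GPU layers: bisect maps the remaining index onto a GPU ordinal
            placement[layer] = bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)

    # Result: layers 0-1 -> "disk", 2-7 -> "cpu", 8-15 -> 0 (GPU 0), 16-23 -> 1 (GPU 1)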
- # dematerialized_modules=True, dematerialized_modules=False, ): + print(device_map) model = AutoModelForCausalLM.from_pretrained( location, - device_map="auto", - # max_memory={0: "10GiB", 1: "7GiB", "cpu": "20GiB"}, + # device_map="auto", + device_map=device_map, offload_folder="accelerate-disk-cache", torch_dtype=torch.float16, **tf_kwargs, @@ -389,18 +417,19 @@ class HFTorchInferenceModel(HFInferenceModel): yield False def breakmodel_device_list(self, n_layers, primary=None, selected=None): - return - # TODO: Find a better place for this or rework this - device_count = torch.cuda.device_count() if device_count < 2: primary = None + logger.debug("n_layers: {}".format(n_layers)) - logger.debug("gpu blocks: {}".format(breakmodel.gpu_blocks)) - gpu_blocks = breakmodel.gpu_blocks + ( - device_count - len(breakmodel.gpu_blocks) + logger.debug("gpu blocks: {}".format(self.breakmodel_config.gpu_blocks)) + + gpu_blocks = self.breakmodel_config.gpu_blocks + ( + device_count - len(self.breakmodel_config.gpu_blocks) ) * [0] + print(f"{Colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{Colors.END}") + for i in range(device_count): name = torch.cuda.get_device_name(i) if len(name) > 47: @@ -410,75 +439,70 @@ class HFTorchInferenceModel(HFInferenceModel): print( f"{row_color}{Colors.YELLOW + '->' + row_color if i == selected else ' '} {'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{Colors.END}" ) + row_color = Colors.END sep_color = Colors.YELLOW print( - f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}" + f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {self.breakmodel_config.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}" ) print( f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){Colors.END}" ) def breakmodel_device_config(self, config): - # TODO: Find a better place for this or rework this - return - - global breakmodel, generator - import breakmodel - n_layers = utils.num_layers(config) - logger.debug("gpu blocks before modification: {}".format(breakmodel.gpu_blocks)) + logger.debug("gpu blocks before modification: {}".format(self.breakmodel_config.gpu_blocks)) if utils.args.cpu: - breakmodel.gpu_blocks = [0] * n_layers + self.breakmodel_config.gpu_blocks = [0] * n_layers return - elif breakmodel.gpu_blocks == []: + elif self.breakmodel_config.gpu_blocks == []: logger.info("Breakmodel not specified, assuming GPU 0") - breakmodel.gpu_blocks = [n_layers] + self.breakmodel_config.gpu_blocks = [n_layers] n_layers = 0 else: s = n_layers - for i in range(len(breakmodel.gpu_blocks)): - if breakmodel.gpu_blocks[i] <= -1: - breakmodel.gpu_blocks[i] = s + for i in range(len(self.breakmodel_config.gpu_blocks)): + if self.breakmodel_config.gpu_blocks[i] <= -1: + self.breakmodel_config.gpu_blocks[i] = s break else: - s -= breakmodel.gpu_blocks[i] - assert sum(breakmodel.gpu_blocks) <= n_layers - n_layers -= sum(breakmodel.gpu_blocks) - if breakmodel.disk_blocks is not None: - assert breakmodel.disk_blocks <= n_layers - n_layers -= breakmodel.disk_blocks + s -= self.breakmodel_config.gpu_blocks[i] + assert sum(self.breakmodel_config.gpu_blocks) <= n_layers + n_layers -= sum(self.breakmodel_config.gpu_blocks) + if self.breakmodel_config.disk_blocks is not None: + assert 
self.breakmodel_config.disk_blocks <= n_layers + n_layers -= self.breakmodel_config.disk_blocks logger.init_ok("Final device configuration:", status="Info") - self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device) + self.breakmodel_device_list(n_layers, primary=self.breakmodel_config.primary_device) with open( "settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w" ) as file: file.write( "{}\n{}".format( - ",".join(map(str, breakmodel.gpu_blocks)), breakmodel.disk_blocks + ",".join(map(str, self.breakmodel_config.gpu_blocks)), self.breakmodel_config.disk_blocks ) ) # If all layers are on the same device, use the old GPU generation mode - while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0: - breakmodel.gpu_blocks.pop() + while len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[-1] == 0: + self.breakmodel_config.gpu_blocks.pop() self.breakmodel = True - if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in ( + if len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[-1] in ( -1, utils.num_layers(config), ): logger.debug("All layers on same GPU. Breakmodel disabled") self.breakmodel = False self.usegpu = True - utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks) - 1 + utils.koboldai_vars.gpu_device = len(self.breakmodel_config.gpu_blocks) - 1 return - if not breakmodel.gpu_blocks: + if not self.breakmodel_config.gpu_blocks: logger.warning("Nothing assigned to a GPU, reverting to CPU only mode") self.breakmodel = False self.usegpu = False From 5f224e1366f94fd4c431e9698483b1036a34b2f4 Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 21 Jun 2023 14:13:14 -0500 Subject: [PATCH 13/37] Restore choice of lazyload or not --- modeling/inference_models/hf_torch.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index c1bcdf0b..10b0fa3c 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -101,8 +101,9 @@ class HFTorchInferenceModel(HFInferenceModel): ret = super().set_input_parameters(parameters) # Hook onto input param setting for setting breakmodel stuff - self.breakmodel_config.gpu_blocks = self.layers - self.breakmodel_config.disk_blocks = self.disk_layers + if self.breakmodel: + self.breakmodel_config.gpu_blocks = self.layers + self.breakmodel_config.disk_blocks = self.disk_layers return ret @@ -303,20 +304,19 @@ class HFTorchInferenceModel(HFInferenceModel): # Try to determine model type from either AutoModel or falling back to legacy try: - with lazy_loader.use_lazy_load(dematerialized_modules=True): - metamodel = AutoModelForCausalLM.from_config(self.model_config) - device_map = self.breakmodel_config.get_device_map(metamodel) + if self.lazy_load: + with lazy_loader.use_lazy_load(dematerialized_modules=True): + metamodel = AutoModelForCausalLM.from_config(self.model_config) + tf_kwargs["device_map"] = self.breakmodel_config.get_device_map(metamodel) + print("Rodger rodger", tf_kwargs) with lazy_loader.use_lazy_load( - enable=True, + enable=self.lazy_load, # DO NOT DEMATERIALIZE MODULES / INIT WEIGHTS EMPTY!!! IT WILL EXPLODE!!!!!!! 
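The two-line settings/<model>.breakmodel file written above is simple enough to read back by hand. The reader below is a hypothetical sketch; the file name and the assumption that the disk count was stored as an integer are mine, not from the patch.

    # Hypothetical reader for the format written above:
    #   line 1: comma-separated per-GPU layer counts
    #   line 2: disk-cached layer count
    with open("settings/example-model.breakmodel") as file:  # placeholder path
        gpu_line, disk_line = file.read().split("\n")

    gpu_blocks = [int(x) for x in gpu_line.split(",")]
    disk_blocks = int(disk_line)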
dematerialized_modules=False, ): - print(device_map) model = AutoModelForCausalLM.from_pretrained( location, - # device_map="auto", - device_map=device_map, offload_folder="accelerate-disk-cache", torch_dtype=torch.float16, **tf_kwargs, From aca2b532d7f74e9babfcb561a10397fb8f38f808 Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 21 Jun 2023 14:15:38 -0500 Subject: [PATCH 14/37] Remove debug --- modeling/inference_models/hf_torch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 10b0fa3c..9a941cf6 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -308,7 +308,6 @@ class HFTorchInferenceModel(HFInferenceModel): with lazy_loader.use_lazy_load(dematerialized_modules=True): metamodel = AutoModelForCausalLM.from_config(self.model_config) tf_kwargs["device_map"] = self.breakmodel_config.get_device_map(metamodel) - print("Rodger rodger", tf_kwargs) with lazy_loader.use_lazy_load( enable=self.lazy_load, From c56214c275807a29e081ea5841fcadfe804b805f Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 21 Jun 2023 16:27:22 -0500 Subject: [PATCH 15/37] Fix loading bar --- modeling/lazy_loader.py | 9 +++++++-- modeling/patches.py | 7 ++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index b6ea1623..c14c7967 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -159,7 +159,7 @@ class TorchLazyTensor(LazyTensor): ): # Cache miss. Assuming weights are loaded in order of # (key, seek_offset), this means we need to invalidate the cache. - print("!", end="", flush=True) + # print("!", end="", flush=True) CheckpointChunkCache.hit_data["misses"] += 1 CheckpointChunkCache.clear() @@ -176,7 +176,7 @@ class TorchLazyTensor(LazyTensor): ) else: # Cache hit. Hip hip hooray! 
:^) - print(".", end="", flush=True) + # print(".", end="", flush=True) CheckpointChunkCache.hit_data["hits"] += 1 size = reduce(lambda x, y: x * y, self.shape, 1) @@ -481,6 +481,11 @@ def post_load_cleanup() -> None: logger.debug(f"[lazy_load] CheckpointChunkCache Hit Data: {CheckpointChunkCache.hit_data}") CheckpointChunkCache.clear(unload_model=True) + # Bar is initialized in + # patches.patch_transformers_for_lazyload._load_state_dict_into_meta_model, + # as it has access to the state dict (for getting tensor count) + utils.bar = None + for v in torch_checkpoint_file_handles.values(): v.close() torch_checkpoint_file_handles = {} diff --git a/modeling/patches.py b/modeling/patches.py index bd5a6325..71319ef8 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -1,9 +1,8 @@ from __future__ import annotations import copy -from ctypes import Union import requests -from typing import Iterable, List, Optional +from typing import List from tqdm.auto import tqdm import transformers @@ -14,7 +13,6 @@ from transformers import ( from modeling.lazy_loader import LazyTensor import utils -from logger import logger def patch_transformers_download(): @@ -203,6 +201,8 @@ def patch_transformers_for_lazyload() -> None: state_dict[new_key] = state_dict.pop(old_key) # BEGIN PATCH + utils.bar = tqdm(total=len(state_dict), desc="Loading model tensors", file=utils.UIProgressBarFile()) + for param_name, param in sorted( state_dict.items(), # State dict must be ordered in this manner to make the caching in @@ -217,6 +217,7 @@ def patch_transformers_for_lazyload() -> None: if isinstance(param, LazyTensor): # Should always be true param = param.materialize() + utils.bar.update(1) # END PATCH # First part of the test is always true as load_state_dict_keys always contains state_dict keys. From 70f113141c533b142ed60aefb7557cdcde34ffd2 Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 21 Jun 2023 16:40:12 -0500 Subject: [PATCH 16/37] Fix Transformers 4.30 --- modeling/patches.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/modeling/patches.py b/modeling/patches.py index 71319ef8..83d157b4 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -151,7 +151,7 @@ def patch_transformers_for_lazyload() -> None: def _load_state_dict_into_meta_model( model, state_dict, - loaded_state_dict_keys, # left for now but could be removed, see below + loaded_state_dict_keys, start_prefix, expected_keys, device_map=None, @@ -160,28 +160,17 @@ def patch_transformers_for_lazyload() -> None: state_dict_folder=None, state_dict_index=None, dtype=None, + # PATCH: load_in_8bit was renamed to is_quantized in Transformers 4.30, keep + # both for short term compatibility load_in_8bit=False, + is_quantized=False, + is_safetensors=False, keep_in_fp32_modules=None, ): - """ - This is somewhat similar to `_load_state_dict_into_model`, but deals with a model that has some or all of its - params on a `meta` device. It replaces the model params with the data from the `state_dict`, while moving the - params back to the normal device, but only for `loaded_state_dict_keys`. + is_quantized = is_quantized or load_in_8bit - `start_prefix` is used for models which insert their name into model keys, e.g. 
`bert` in - `bert.pooler.dense.weight` - - """ - - # XXX: remaining features to implement to be fully compatible with _load_state_dict_into_model - # - deepspeed zero 3 support - # - need to copy metadata if any - see _load_state_dict_into_model - # - handling error_msgs - mimicking the error handling in module._load_from_state_dict() - # - Is there a situation where some keys aren't in `loaded_state_dict_keys` and in which case - # they won't get loaded. - - if load_in_8bit: + if is_quantized: from .utils.bitsandbytes import set_module_8bit_tensor_to_device error_msgs = [] @@ -280,7 +269,7 @@ def patch_transformers_for_lazyload() -> None: state_dict_index = offload_weight( param, param_name, state_dict_folder, state_dict_index ) - elif not load_in_8bit: + elif not is_quantized: # For backward compatibility with older versions of `accelerate` set_module_tensor_to_device( model, From c40649a74e18651d54795c460ccd06dd4acb92f5 Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 21 Jun 2023 16:53:30 -0500 Subject: [PATCH 17/37] Probably fix f32 --- modeling/inference_models/hf_torch.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 9a941cf6..8d06ff6e 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -107,6 +107,13 @@ class HFTorchInferenceModel(HFInferenceModel): return ret + def _get_target_dtype(self) -> Union[torch.float16, torch.float32]: + if self.breakmodel_config.primary_device == "cpu": + return torch.float32 + elif utils.args.cpu: + return torch.float32 + return torch.float16 + def _apply_warpers( self, scores: torch.Tensor, input_ids: torch.Tensor ) -> torch.Tensor: @@ -317,7 +324,7 @@ class HFTorchInferenceModel(HFInferenceModel): model = AutoModelForCausalLM.from_pretrained( location, offload_folder="accelerate-disk-cache", - torch_dtype=torch.float16, + torch_dtype=self._get_target_dtype(), **tf_kwargs, ) From 0012158eac006e65d2d428c11eeabbb39745a347 Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 21 Jun 2023 16:58:59 -0500 Subject: [PATCH 18/37] Remove old --- modeling/inference_models/generic_hf_torch/class.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index 34f1d85c..ad17b85b 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -236,7 +236,6 @@ class model_backend(HFTorchInferenceModel): shutil.rmtree("cache/") self.patch_embedding() - # self.model.tie_weights() self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() From 947bcc58e486385f9a8bdcc5d1509e28cb9d60c4 Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 21 Jun 2023 17:33:14 -0500 Subject: [PATCH 19/37] Experiments --- aiserver.py | 2 +- modeling/inference_models/hf_torch.py | 18 +++++++++++++++++- utils.py | 3 +++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/aiserver.py b/aiserver.py index cbb15dc0..b2d22e56 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3779,7 +3779,7 @@ def calcsubmit(txt): bias[i] = b["multiplier"] - device = model.get_auxilary_device() + device = utils.get_auxilary_device() attention_bias.attention_bias = torch.Tensor(bias).to(device) logger.info(f"Bias by {koboldai_vars.memory_attn_bias} -- {attention_bias.attention_bias}") logger.debug("Submit: experimental_features time {}s".format(time.time()-start_time)) diff 
--git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 8d06ff6e..a10a48f3 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -46,9 +46,14 @@ class BreakmodelConfig: def __init__(self) -> None: self.disk_blocks = 0 self.gpu_blocks = [] + self.primary_device = 0 if torch.cuda.device_count() > 0 else "cpu" def get_device_map(self, model: nn.Module) -> dict: + # HACK + if utils.args.cpu: + self.primary_device = "cpu" + ram_blocks = len(utils.layers_module_names) - sum(self.gpu_blocks) cumulative_gpu_blocks = tuple(itertools.accumulate(self.gpu_blocks)) device_map = {} @@ -311,10 +316,21 @@ class HFTorchInferenceModel(HFInferenceModel): # Try to determine model type from either AutoModel or falling back to legacy try: + print(f"self.lazy_load {self.lazy_load}") + print(f"self.breakmodel {self.breakmodel}") + print(f"self.nobreakmodel {self.nobreakmodel}") + print(f"args.cpu {utils.args.cpu}") + if self.lazy_load: with lazy_loader.use_lazy_load(dematerialized_modules=True): metamodel = AutoModelForCausalLM.from_config(self.model_config) - tf_kwargs["device_map"] = self.breakmodel_config.get_device_map(metamodel) + if utils.args.cpu: + cpu_map = {name: "cpu" for name in utils.layers_module_names} + for name in utils.get_missing_module_names(metamodel, list(cpu_map.keys())): + cpu_map[name] = "cpu" + tf_kwargs["device_map"] = cpu_map + else: + tf_kwargs["device_map"] = self.breakmodel_config.get_device_map(metamodel) with lazy_loader.use_lazy_load( enable=self.lazy_load, diff --git a/utils.py b/utils.py index 863bda2f..0d30f194 100644 --- a/utils.py +++ b/utils.py @@ -655,10 +655,13 @@ def get_auxilary_device(): # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU. if koboldai_vars.hascuda and koboldai_vars.usegpu: + print("GP") return koboldai_vars.gpu_device elif koboldai_vars.hascuda: # TODO: Primary device + print("CUDA") return "cuda" + print("CPU") return "cpu" #==================================================================# From 5278174a62fa08bedcac04da605abcbeb47044cf Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 21 Jun 2023 17:40:47 -0500 Subject: [PATCH 20/37] Materialize on cpu --- modeling/patches.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modeling/patches.py b/modeling/patches.py index 83d157b4..dff10b93 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -205,7 +205,7 @@ def patch_transformers_for_lazyload() -> None: if isinstance(param, LazyTensor): # Should always be true - param = param.materialize() + param = param.materialize(map_location="cpu") utils.bar.update(1) # END PATCH From d4b923a054f4735191cb769acc4d55f9ef51b976 Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 21 Jun 2023 17:41:15 -0500 Subject: [PATCH 21/37] Remove debug --- utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/utils.py b/utils.py index 0d30f194..863bda2f 100644 --- a/utils.py +++ b/utils.py @@ -655,13 +655,10 @@ def get_auxilary_device(): # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU. 
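This stretch of the series deals with CPU-only mode: forcing the primary device to "cpu", building an all-CPU device map, and (a few patches later) hiding the GPUs outright with CUDA_VISIBLE_DEVICES. A minimal sketch of why that environment trick works follows; the value only has to match no real device, and it must be set before CUDA is first touched.

    import os

    # Any value that names no existing GPU leaves CUDA with zero visible devices.
    # This must happen before torch initializes CUDA.
    os.environ["CUDA_VISIBLE_DEVICES"] = "None"

    import torch

    print(torch.cuda.is_available())   # False
    print(torch.cuda.device_count())   # 0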
if koboldai_vars.hascuda and koboldai_vars.usegpu: - print("GP") return koboldai_vars.gpu_device elif koboldai_vars.hascuda: # TODO: Primary device - print("CUDA") return "cuda" - print("CPU") return "cpu" #==================================================================# From b81f61b8209c54d1325ff9a0803d01b62f226f38 Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 21 Jun 2023 18:35:56 -0500 Subject: [PATCH 22/37] Clean debug --- modeling/inference_models/hf_torch.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index a10a48f3..6bcd88cd 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -110,6 +110,9 @@ class HFTorchInferenceModel(HFInferenceModel): self.breakmodel_config.gpu_blocks = self.layers self.breakmodel_config.disk_blocks = self.disk_layers + # HACK: Prevent get_auxiliary_device from returning cuda + utils.koboldai_vars.hascuda = self.usegpu + return ret def _get_target_dtype(self) -> Union[torch.float16, torch.float32]: @@ -117,6 +120,8 @@ class HFTorchInferenceModel(HFInferenceModel): return torch.float32 elif utils.args.cpu: return torch.float32 + elif not self.usegpu: + return torch.float32 return torch.float16 def _apply_warpers( @@ -316,11 +321,6 @@ class HFTorchInferenceModel(HFInferenceModel): # Try to determine model type from either AutoModel or falling back to legacy try: - print(f"self.lazy_load {self.lazy_load}") - print(f"self.breakmodel {self.breakmodel}") - print(f"self.nobreakmodel {self.nobreakmodel}") - print(f"args.cpu {utils.args.cpu}") - if self.lazy_load: with lazy_loader.use_lazy_load(dematerialized_modules=True): metamodel = AutoModelForCausalLM.from_config(self.model_config) @@ -344,6 +344,13 @@ class HFTorchInferenceModel(HFInferenceModel): **tf_kwargs, ) + if not self.lazy_load: + # We need to move the model to the desired device + if (not self.usegpu) or torch.cuda.device_count() <= 0: + model = model.to("cpu") + else: + model = model.to("cuda") + return model except Exception as e: traceback_string = traceback.format_exc().lower() From 5ee20bd7d635bc54a748e460f00d6caf2181dd29 Mon Sep 17 00:00:00 2001 From: somebody Date: Wed, 21 Jun 2023 21:18:43 -0500 Subject: [PATCH 23/37] Fix for CPU loading --- modeling/inference_models/hf_torch.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 6bcd88cd..514a1e5b 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -50,8 +50,12 @@ class BreakmodelConfig: self.primary_device = 0 if torch.cuda.device_count() > 0 else "cpu" def get_device_map(self, model: nn.Module) -> dict: - # HACK - if utils.args.cpu: + if ( + # Explicitly CPU-only + utils.args.cpu + # No blocks are on GPU + or not sum(self.gpu_blocks) + ): self.primary_device = "cpu" ram_blocks = len(utils.layers_module_names) - sum(self.gpu_blocks) From 1da4580e8bf91b8497bc5a022a7818f6aefe1113 Mon Sep 17 00:00:00 2001 From: Henk Date: Thu, 22 Jun 2023 07:07:02 +0200 Subject: [PATCH 24/37] Remove wrong usegpu behavior --- modeling/inference_models/hf_torch.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 514a1e5b..7719f022 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -114,9 +114,6 @@ class 
HFTorchInferenceModel(HFInferenceModel): self.breakmodel_config.gpu_blocks = self.layers self.breakmodel_config.disk_blocks = self.disk_layers - # HACK: Prevent get_auxiliary_device from returning cuda - utils.koboldai_vars.hascuda = self.usegpu - return ret def _get_target_dtype(self) -> Union[torch.float16, torch.float32]: @@ -124,8 +121,6 @@ class HFTorchInferenceModel(HFInferenceModel): return torch.float32 elif utils.args.cpu: return torch.float32 - elif not self.usegpu: - return torch.float32 return torch.float16 def _apply_warpers( From 81e72329af1485223bf1652886d6b64ecbaee6cd Mon Sep 17 00:00:00 2001 From: Henk Date: Sun, 2 Jul 2023 21:50:23 +0200 Subject: [PATCH 25/37] CPU fixes --- modeling/inference_models/hf_torch.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 7719f022..2c035e62 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -121,6 +121,8 @@ class HFTorchInferenceModel(HFInferenceModel): return torch.float32 elif utils.args.cpu: return torch.float32 + elif not self.usegpu and not self.breakmodel: + return torch.float32 return torch.float16 def _apply_warpers( @@ -268,9 +270,11 @@ class HFTorchInferenceModel(HFInferenceModel): gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] else: gen_in = prompt_tokens - - device = utils.get_auxilary_device() - gen_in = gen_in.to(device) + if not self.usegpu and not self.breakmodel: + gen_in = gen_in.to("cpu") + else: + device = utils.get_auxilary_device() + gen_in = gen_in.to(device) additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else [] From 51c462269419c83b9fe8ac776e5b1490179ae091 Mon Sep 17 00:00:00 2001 From: Henk Date: Sun, 2 Jul 2023 22:27:26 +0200 Subject: [PATCH 26/37] Hide all Nvidia GPU's in CPU mode --- aiserver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/aiserver.py b/aiserver.py index b2d22e56..613c735f 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1528,6 +1528,7 @@ def general_startup(override_args=None): print(f"Allowed IPs: {allowed_ips}") if args.cpu: + os.environ['CUDA_VISIBLE_DEVICES'] = "None" koboldai_vars.use_colab_tpu = False koboldai_vars.hascuda = False koboldai_vars.usegpu = False From 4a45a0e551a2c5c6a63642f92bad26fad8d62991 Mon Sep 17 00:00:00 2001 From: Henk Date: Sun, 2 Jul 2023 23:02:02 +0200 Subject: [PATCH 27/37] Fix bar spam --- modeling/patches.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modeling/patches.py b/modeling/patches.py index dff10b93..b7c5370a 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -190,7 +190,7 @@ def patch_transformers_for_lazyload() -> None: state_dict[new_key] = state_dict.pop(old_key) # BEGIN PATCH - utils.bar = tqdm(total=len(state_dict), desc="Loading model tensors", file=utils.UIProgressBarFile()) + utils.bar = tqdm(total=len(state_dict), desc="Loading model tensors", file=utils.UIProgressBarFile(), position=1) for param_name, param in sorted( state_dict.items(), From 062e3ed27c8104732da45e6d3a70971a617a766b Mon Sep 17 00:00:00 2001 From: Henk Date: Sun, 2 Jul 2023 23:19:20 +0200 Subject: [PATCH 28/37] Torch bump --- requirements_mtj.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_mtj.txt b/requirements_mtj.txt index 1f4359f6..e3f91282 100644 --- a/requirements_mtj.txt +++ b/requirements_mtj.txt @@ -1,4 +1,4 @@ -torch >= 1.9, < 1.13 +torch == 2.0 numpy tqdm requests From 
686c3d1592dbb97198e6ba62ce33a825ba60e3a6 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 3 Jul 2023 16:52:18 -0500 Subject: [PATCH 29/37] Don't patch lazyload on TPU --- aiserver.py | 7 ++----- modeling/patches.py | 8 ++++---- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/aiserver.py b/aiserver.py index 613c735f..d5dbdfae 100644 --- a/aiserver.py +++ b/aiserver.py @@ -27,9 +27,6 @@ from ansi2html import Ansi2HTMLConverter logging.getLogger("urllib3").setLevel(logging.ERROR) -from modeling import patches -patches.patch_transformers_for_lazyload() - import attention_bias attention_bias.do_patches() @@ -10809,7 +10806,7 @@ def run(): Session(app) logger.init_ok("Flask", status="OK") logger.init("Webserver", status="Starting") - patch_transformers() + patch_transformers(use_tpu=koboldai_vars.use_colab_tpu) # Start Flask/SocketIO (Blocking, so this must be last method!) port = args.port if "port" in args and args.port is not None else 5000 @@ -10906,7 +10903,7 @@ else: logger.init("Flask", status="Starting") Session(app) logger.init_ok("Flask", status="OK") - patch_transformers() + patch_transformers(use_tpu=koboldai_vars.use_colab_tpu) startup(command_line_backend) koboldai_settings.port = args.port if "port" in args and args.port is not None else 5000 print("{0}\nServer started in WSGI mode!{1}".format(colors.GREEN, colors.END), flush=True) diff --git a/modeling/patches.py b/modeling/patches.py index b7c5370a..d8990327 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -164,7 +164,6 @@ def patch_transformers_for_lazyload() -> None: # both for short term compatibility load_in_8bit=False, is_quantized=False, - is_safetensors=False, keep_in_fp32_modules=None, ): @@ -303,9 +302,10 @@ def patch_transformers_for_lazyload() -> None: ) -def patch_transformers() -> None: +def patch_transformers(use_tpu: bool) -> None: patch_transformers_download() patch_transformers_loader() - # Doesn't do anything for TPU - patch_transformers_generation() + if not use_tpu: + patch_transformers_generation() + patch_transformers_for_lazyload() From 31a3046a189cc05984c1569888ffd3c1f1468a10 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 3 Jul 2023 17:07:18 -0500 Subject: [PATCH 30/37] Load empty modules without accelerate --- modeling/lazy_loader.py | 162 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 154 insertions(+), 8 deletions(-) diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index c14c7967..b8870cee 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -46,6 +46,7 @@ POSSIBILITY OF SUCH DAMAGE. import contextlib from functools import reduce +import itertools import time import zipfile import pickle @@ -56,13 +57,13 @@ import _codecs import os from typing import Any, Callable, Dict, Optional, Tuple, Type +from torch.nn import Module from torch.storage import UntypedStorage # Safetensors is a dependency for the local version, TPU/Colab doesn't # support it yet. try: import safetensors - HAS_SAFETENSORS = True except ModuleNotFoundError: HAS_SAFETENSORS = False @@ -70,6 +71,16 @@ except ModuleNotFoundError: import utils from logger import logger +# Accelerate is used to load with empty modules. 
TPU version doesn't come +# packaged with it so we use an in-house solution in that case +try: + import accelerate + HAS_ACCELERATE = True +except ModuleNotFoundError: + HAS_ACCELERATE = False + +_EXTRA_STATE_KEY_SUFFIX = "_extra_state" + # Storage of zipfile handles for each shard torch_checkpoint_file_handles = {} @@ -145,7 +156,6 @@ class TorchLazyTensor(LazyTensor): map_location=None, no_grad=True, ) -> torch.Tensor: - checkpoint = torch_checkpoint_file_handles[self.file_name] filename = os.path.basename(os.path.normpath(self.file_name)).split(".")[0] @@ -321,6 +331,116 @@ def _rebuild_tensor(lazy_storage: LazyTensor, storage_offset, shape, stride): ) return lazy_storage +# Modified version of https://github.com/pytorch/pytorch/blob/v1.11.0-rc4/torch/nn/modules/module.py#L1346-L1438 +def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, +): + for hook in self._load_state_dict_pre_hooks.values(): + hook( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + persistent_buffers = { + k: v + for k, v in self._buffers.items() + if k not in self._non_persistent_buffers_set + } + local_name_params = itertools.chain( + self._parameters.items(), persistent_buffers.items() + ) + local_state = {k: v for k, v in local_name_params if v is not None} + + for name, param in local_state.items(): + key = prefix + name + if key in state_dict: + input_param = state_dict[key] + if not torch.overrides.is_tensor_like(input_param): + error_msgs.append( + 'While copying the parameter named "{}", ' + "expected torch.Tensor or Tensor-like object from checkpoint but " + "received {}".format(key, type(input_param)) + ) + continue + + # This is used to avoid copying uninitialized parameters into + # non-lazy modules, since they dont have the hook to do the checks + # in such case, it will error when accessing the .shape attribute. 
+ is_param_lazy = torch.nn.parameter.is_lazy(param) + # Backward compatibility: loading 1-dim tensor from 0.3.* to version 0.4+ + if ( + not is_param_lazy + and len(param.shape) == 0 + and len(input_param.shape) == 1 + ): + input_param = input_param[0] + + if not is_param_lazy and input_param.shape != param.shape: + # local shape should match the one in checkpoint + error_msgs.append( + "size mismatch for {}: copying a param with shape {} from checkpoint, " + "the shape in current model is {}.".format( + key, input_param.shape, param.shape + ) + ) + continue + try: + with torch.no_grad(): + # param.copy_(input_param) + new_param = torch.nn.Parameter( + input_param, requires_grad=param.requires_grad + ) # This line is new + if name in self._parameters: # This line is new + self._parameters[name] = new_param # This line is new + if name in persistent_buffers: # This line is new + self._buffers[name] = new_param # This line is new + except Exception as ex: + error_msgs.append( + 'While copying the parameter named "{}", ' + "whose dimensions in the model are {} and " + "whose dimensions in the checkpoint are {}, " + "an exception occurred : {}.".format( + key, param.size(), input_param.size(), ex.args + ) + ) + elif strict: + missing_keys.append(key) + + extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX + if ( + hasattr(Module, "set_extra_state") + and getattr(self.__class__, "set_extra_state", Module.set_extra_state) + is not Module.set_extra_state + ): # if getattr(self.__class__, "set_extra_state", Module.set_extra_state) is not Module.set_extra_state: + if extra_state_key in state_dict: + self.set_extra_state(state_dict[extra_state_key]) + elif strict: + missing_keys.append(extra_state_key) + elif strict and (extra_state_key in state_dict): + unexpected_keys.append(extra_state_key) + + if strict: + for key in state_dict.keys(): + if key.startswith(prefix) and key != extra_state_key: + input_name = key[len(prefix) :] + input_name = input_name.split(".", 1)[ + 0 + ] # get the name of param/buffer/child + if input_name not in self._modules and input_name not in local_state: + unexpected_keys.append(key) + def safetensors_load_tensor_independently( checkpoint_file: str, tensor_key: str, device: Any @@ -360,7 +480,6 @@ def patch_safetensors(callback): tensors[key] = None for key in tensors.keys(): - tensors[key] = SafetensorsLazyTensor( checkpoint_file=checkpoint_file, key=key, @@ -453,10 +572,29 @@ def use_lazy_load( patch_safetensors(callback) if dematerialized_modules: - import accelerate + if HAS_ACCELERATE: + init_empty_weights = accelerate.init_empty_weights() + init_empty_weights.__enter__() + else: + # TPU doesn't use accelerate package + old_linear_init = torch.nn.Linear.__init__ + old_embedding_init = torch.nn.Embedding.__init__ + old_layernorm_init = torch.nn.LayerNorm.__init__ - init_empty_weights = accelerate.init_empty_weights() - init_empty_weights.__enter__() + def linear_init(self, *args, device=None, **kwargs): + return old_linear_init(self, *args, device="meta", **kwargs) + + def embedding_init(self, *args, device=None, **kwargs): + return old_embedding_init(self, *args, device="meta", **kwargs) + + def layernorm_init(self, *args, device=None, **kwargs): + return old_layernorm_init(self, *args, device="meta", **kwargs) + + torch.nn.Linear.__init__ = linear_init + torch.nn.Embedding.__init__ = embedding_init + torch.nn.LayerNorm.__init__ = layernorm_init + old_load_from_state_dict = torch.nn.Module._load_from_state_dict + torch.nn.Module._load_from_state_dict = 
_load_from_state_dict with use_custom_unpickler(_LazyUnpickler): yield True @@ -471,14 +609,22 @@ def use_lazy_load( ) if dematerialized_modules: - init_empty_weights.__exit__(None, None, None) + if HAS_ACCELERATE: + init_empty_weights.__exit__(None, None, None) + else: + torch.nn.Linear.__init__ = old_linear_init + torch.nn.Embedding.__init__ = old_embedding_init + torch.nn.LayerNorm.__init__ = old_layernorm_init + torch.nn.Module._load_from_state_dict = old_load_from_state_dict def post_load_cleanup() -> None: """Close dangling file pointers and clear caches after the load is complete.""" global torch_checkpoint_file_handles - logger.debug(f"[lazy_load] CheckpointChunkCache Hit Data: {CheckpointChunkCache.hit_data}") + logger.debug( + f"[lazy_load] CheckpointChunkCache Hit Data: {CheckpointChunkCache.hit_data}" + ) CheckpointChunkCache.clear(unload_model=True) # Bar is initialized in From 1bb2d2621cec85f924a123c1fb179a79127aed6f Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 3 Jul 2023 17:12:07 -0500 Subject: [PATCH 31/37] Make TPU in line with new lazyload behavior --- tpu_mtj_backend.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/tpu_mtj_backend.py b/tpu_mtj_backend.py index ec69f66d..a5fd9d69 100644 --- a/tpu_mtj_backend.py +++ b/tpu_mtj_backend.py @@ -1196,6 +1196,7 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword if utils.num_shards is not None: utils.current_shard += 1 + for key in sorted(model_dict.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)): model_spec_key = max((k for k in model_spec.keys() if key.endswith(k)), key=len, default=None) @@ -1210,31 +1211,16 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword koboldai_vars.loaded_layers += 1 continue - storage_key = model_dict[key].key - if storage_key != last_storage_key or model_dict[key].seek_offset < current_offset: - last_storage_key = storage_key - if isinstance(f, zipfile.ZipExtFile): - f.close() - try: - f = z.open(f"archive/data/{storage_key}") - except: - f = z.open(f"{zipfolder}/data/{storage_key}") - current_offset = 0 - if current_offset != model_dict[key].seek_offset: - f.read(model_dict[key].seek_offset - current_offset) - current_offset = model_dict[key].seek_offset spec = model_spec[model_spec_key] transforms = set(spec.get("transforms", ())) + if not isinstance(model_dict[key], lazy_loader.LazyTensor): error = f"Duplicate key {repr(key)}" print("\n\nERROR: " + error, file=sys.stderr) raise RuntimeError(error) - size = functools.reduce(lambda x, y: x * y, model_dict[key].shape, 1) - dtype = model_dict[key].dtype - nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) - tensor = model_dict[key].materialize(f, map_location="cpu") + + tensor = model_dict[key].materialize(map_location="cpu") model_dict[key] = tensor.to("meta") - current_offset += nbytes # MTJ requires certain mathematical operations to be performed # on tensors in order for them to be in the correct format From 7f869a54d8d0199e9dd0c630eb88a6833f60b5e4 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 3 Jul 2023 17:18:48 -0500 Subject: [PATCH 32/37] Just use accelerate on tpu --- modeling/lazy_loader.py | 156 ++-------------------------------------- requirements_mtj.txt | 1 + 2 files changed, 6 insertions(+), 151 deletions(-) diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index b8870cee..b61f9be6 100644 --- 
a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -56,6 +56,7 @@ import collections import _codecs import os from typing import Any, Callable, Dict, Optional, Tuple, Type +import accelerate from torch.nn import Module from torch.storage import UntypedStorage @@ -64,6 +65,7 @@ from torch.storage import UntypedStorage # support it yet. try: import safetensors + HAS_SAFETENSORS = True except ModuleNotFoundError: HAS_SAFETENSORS = False @@ -71,17 +73,6 @@ except ModuleNotFoundError: import utils from logger import logger -# Accelerate is used to load with empty modules. TPU version doesn't come -# packaged with it so we use an in-house solution in that case -try: - import accelerate - HAS_ACCELERATE = True -except ModuleNotFoundError: - HAS_ACCELERATE = False - -_EXTRA_STATE_KEY_SUFFIX = "_extra_state" - - # Storage of zipfile handles for each shard torch_checkpoint_file_handles = {} @@ -331,116 +322,6 @@ def _rebuild_tensor(lazy_storage: LazyTensor, storage_offset, shape, stride): ) return lazy_storage -# Modified version of https://github.com/pytorch/pytorch/blob/v1.11.0-rc4/torch/nn/modules/module.py#L1346-L1438 -def _load_from_state_dict( - self, - state_dict, - prefix, - local_metadata, - strict, - missing_keys, - unexpected_keys, - error_msgs, -): - for hook in self._load_state_dict_pre_hooks.values(): - hook( - state_dict, - prefix, - local_metadata, - strict, - missing_keys, - unexpected_keys, - error_msgs, - ) - - persistent_buffers = { - k: v - for k, v in self._buffers.items() - if k not in self._non_persistent_buffers_set - } - local_name_params = itertools.chain( - self._parameters.items(), persistent_buffers.items() - ) - local_state = {k: v for k, v in local_name_params if v is not None} - - for name, param in local_state.items(): - key = prefix + name - if key in state_dict: - input_param = state_dict[key] - if not torch.overrides.is_tensor_like(input_param): - error_msgs.append( - 'While copying the parameter named "{}", ' - "expected torch.Tensor or Tensor-like object from checkpoint but " - "received {}".format(key, type(input_param)) - ) - continue - - # This is used to avoid copying uninitialized parameters into - # non-lazy modules, since they dont have the hook to do the checks - # in such case, it will error when accessing the .shape attribute. 
- is_param_lazy = torch.nn.parameter.is_lazy(param) - # Backward compatibility: loading 1-dim tensor from 0.3.* to version 0.4+ - if ( - not is_param_lazy - and len(param.shape) == 0 - and len(input_param.shape) == 1 - ): - input_param = input_param[0] - - if not is_param_lazy and input_param.shape != param.shape: - # local shape should match the one in checkpoint - error_msgs.append( - "size mismatch for {}: copying a param with shape {} from checkpoint, " - "the shape in current model is {}.".format( - key, input_param.shape, param.shape - ) - ) - continue - try: - with torch.no_grad(): - # param.copy_(input_param) - new_param = torch.nn.Parameter( - input_param, requires_grad=param.requires_grad - ) # This line is new - if name in self._parameters: # This line is new - self._parameters[name] = new_param # This line is new - if name in persistent_buffers: # This line is new - self._buffers[name] = new_param # This line is new - except Exception as ex: - error_msgs.append( - 'While copying the parameter named "{}", ' - "whose dimensions in the model are {} and " - "whose dimensions in the checkpoint are {}, " - "an exception occurred : {}.".format( - key, param.size(), input_param.size(), ex.args - ) - ) - elif strict: - missing_keys.append(key) - - extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX - if ( - hasattr(Module, "set_extra_state") - and getattr(self.__class__, "set_extra_state", Module.set_extra_state) - is not Module.set_extra_state - ): # if getattr(self.__class__, "set_extra_state", Module.set_extra_state) is not Module.set_extra_state: - if extra_state_key in state_dict: - self.set_extra_state(state_dict[extra_state_key]) - elif strict: - missing_keys.append(extra_state_key) - elif strict and (extra_state_key in state_dict): - unexpected_keys.append(extra_state_key) - - if strict: - for key in state_dict.keys(): - if key.startswith(prefix) and key != extra_state_key: - input_name = key[len(prefix) :] - input_name = input_name.split(".", 1)[ - 0 - ] # get the name of param/buffer/child - if input_name not in self._modules and input_name not in local_state: - unexpected_keys.append(key) - def safetensors_load_tensor_independently( checkpoint_file: str, tensor_key: str, device: Any @@ -572,29 +453,8 @@ def use_lazy_load( patch_safetensors(callback) if dematerialized_modules: - if HAS_ACCELERATE: - init_empty_weights = accelerate.init_empty_weights() - init_empty_weights.__enter__() - else: - # TPU doesn't use accelerate package - old_linear_init = torch.nn.Linear.__init__ - old_embedding_init = torch.nn.Embedding.__init__ - old_layernorm_init = torch.nn.LayerNorm.__init__ - - def linear_init(self, *args, device=None, **kwargs): - return old_linear_init(self, *args, device="meta", **kwargs) - - def embedding_init(self, *args, device=None, **kwargs): - return old_embedding_init(self, *args, device="meta", **kwargs) - - def layernorm_init(self, *args, device=None, **kwargs): - return old_layernorm_init(self, *args, device="meta", **kwargs) - - torch.nn.Linear.__init__ = linear_init - torch.nn.Embedding.__init__ = embedding_init - torch.nn.LayerNorm.__init__ = layernorm_init - old_load_from_state_dict = torch.nn.Module._load_from_state_dict - torch.nn.Module._load_from_state_dict = _load_from_state_dict + init_empty_weights = accelerate.init_empty_weights() + init_empty_weights.__enter__() with use_custom_unpickler(_LazyUnpickler): yield True @@ -609,13 +469,7 @@ def use_lazy_load( ) if dematerialized_modules: - if HAS_ACCELERATE: - init_empty_weights.__exit__(None, None, None) - 
else: - torch.nn.Linear.__init__ = old_linear_init - torch.nn.Embedding.__init__ = old_embedding_init - torch.nn.LayerNorm.__init__ = old_layernorm_init - torch.nn.Module._load_from_state_dict = old_load_from_state_dict + init_empty_weights.__exit__(None, None, None) def post_load_cleanup() -> None: diff --git a/requirements_mtj.txt b/requirements_mtj.txt index e3f91282..b3521d03 100644 --- a/requirements_mtj.txt +++ b/requirements_mtj.txt @@ -34,3 +34,4 @@ ijson ftfy pydub sentencepiece +accelerate==0.18.0 \ No newline at end of file From 32917fd651cdd86e791ba80f6e3deacb405a00e7 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 3 Jul 2023 17:51:54 -0500 Subject: [PATCH 33/37] Remove unused file open --- tpu_mtj_backend.py | 211 ++++++++++++++++++++++----------------------- 1 file changed, 102 insertions(+), 109 deletions(-) diff --git a/tpu_mtj_backend.py b/tpu_mtj_backend.py index a5fd9d69..5a5271e2 100644 --- a/tpu_mtj_backend.py +++ b/tpu_mtj_backend.py @@ -1171,125 +1171,118 @@ def load_model(path: str, model_type: str, badwordsids=koboldai_settings.badword if callback.nested: return callback.nested = True - with zipfile.ZipFile(f, "r") as z: - try: - last_storage_key = None - zipfolder = os.path.basename(os.path.normpath(f)).split('.')[0] - f = None - current_offset = 0 - if utils.current_shard == 0: - print("\n\n\nThis model has ", f"{hk.data_structures.tree_size(network.state['params']):,d}".replace(",", " "), " parameters.\n") - - if utils.num_shards is None or utils.current_shard == 0: - if utils.num_shards is not None: - num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs)) - else: - num_tensors = len(model_dict) - - if socketio is None: - utils.bar = tqdm(total=num_tensors, desc="Loading model tensors") - else: - utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=utils.UIProgressBarFile(socketio.emit)) - koboldai_vars.status_message = "Loading model" - koboldai_vars.loaded_layers = 0 - koboldai_vars.total_layers = num_tensors + try: + if utils.current_shard == 0: + print("\n\n\nThis model has ", f"{hk.data_structures.tree_size(network.state['params']):,d}".replace(",", " "), " parameters.\n") + if utils.num_shards is None or utils.current_shard == 0: if utils.num_shards is not None: - utils.current_shard += 1 + num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs)) + else: + num_tensors = len(model_dict) - for key in sorted(model_dict.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)): - model_spec_key = max((k for k in model_spec.keys() if key.endswith(k)), key=len, default=None) + if socketio is None: + utils.bar = tqdm(total=num_tensors, desc="Loading model tensors") + else: + utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=utils.UIProgressBarFile(socketio.emit)) + koboldai_vars.status_message = "Loading model" + koboldai_vars.loaded_layers = 0 + koboldai_vars.total_layers = num_tensors - # Some model weights are used by transformers but not by MTJ. - # We have to materialize these weights anyways because - # transformers will throw a tantrum otherwise. To attain - # the least possible memory usage, we create them as meta - # tensors, which don't take up any actual CPU or TPU memory. 
- if model_spec_key is None: - model_dict[key] = torch.empty(model_dict[key].shape, dtype=model_dict[key].dtype, device="meta") - utils.bar.update(1) - koboldai_vars.loaded_layers += 1 - continue + if utils.num_shards is not None: + utils.current_shard += 1 - spec = model_spec[model_spec_key] - transforms = set(spec.get("transforms", ())) + for key in sorted(model_dict.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)): + model_spec_key = max((k for k in model_spec.keys() if key.endswith(k)), key=len, default=None) - if not isinstance(model_dict[key], lazy_loader.LazyTensor): - error = f"Duplicate key {repr(key)}" - print("\n\nERROR: " + error, file=sys.stderr) - raise RuntimeError(error) - - tensor = model_dict[key].materialize(map_location="cpu") - model_dict[key] = tensor.to("meta") - - # MTJ requires certain mathematical operations to be performed - # on tensors in order for them to be in the correct format - if "remove_first_two_rows" in transforms: - tensor = tensor[2:] - if "divide_by_shards" in transforms: - tensor /= params["cores_per_replica"] - if "vocab_pad" in transforms: - tensor = torch.nn.functional.pad(tensor, (0,) * (tensor.ndim * 2 - 1) + (params["n_vocab_padding"],)) - # We don't need to transpose linear module weights anymore because MTJ will do it for us if `transposed_linear` is set to True in the config - #if "no_transpose" not in transforms and tensor.ndim == 2: - # tensor = tensor.T - tensor.unsqueeze_(0) - - - # Shard the tensor so that parts of the tensor can be used - # on different TPU cores - tensor = reshard_reverse( - tensor, - params["cores_per_replica"], - network.state["params"][spec["module"]][spec["param"]].shape, - ) - tensor = tensor.detach() - # numpy does not support bfloat16 - if tensor.dtype is torch.bfloat16: - tensor = tensor.to(torch.float32) - tensor = jnp.array(tensor) - if tensor.dtype is torch.float16 or tensor.dtype is torch.float32: - tensor = tensor.bfloat16() - network.state["params"][spec["module"]][spec["param"]] = move_xmap( - tensor, - np.empty(params["cores_per_replica"]), - ) - - koboldai_vars.loaded_layers += 1 - try: - time.sleep(0.01) - except: - pass + # Some model weights are used by transformers but not by MTJ. + # We have to materialize these weights anyways because + # transformers will throw a tantrum otherwise. To attain + # the least possible memory usage, we create them as meta + # tensors, which don't take up any actual CPU or TPU memory. + if model_spec_key is None: + model_dict[key] = torch.empty(model_dict[key].shape, dtype=model_dict[key].dtype, device="meta") utils.bar.update(1) + koboldai_vars.loaded_layers += 1 + continue - if utils.num_shards is not None and utils.current_shard < utils.num_shards: - return + spec = model_spec[model_spec_key] + transforms = set(spec.get("transforms", ())) - # Check for tensors that MTJ needs that were not provided in the - # HF model - for mk, mv in network.state["params"].items(): - for pk, pv in mv.items(): - if isinstance(pv, PlaceholderTensor): - # The transformers GPT-J models apparently do not - # have embedding bias, whereas MTJ GPT-J models do, - # so we have to supplement an embedding bias tensor - # by creating a tensor with the necessary shape, filled - # with zeros. 
- if mk == "causal_transformer_shard/~/embedding_shard/~/linear" and pk == "b": - mv[pk] = move_xmap(jnp.zeros(mv[pk].shape, dtype=jnp.bfloat16), np.empty(params["cores_per_replica"])) + if not isinstance(model_dict[key], lazy_loader.LazyTensor): + error = f"Duplicate key {repr(key)}" + print("\n\nERROR: " + error, file=sys.stderr) + raise RuntimeError(error) - else: - error = f"{mk} {pk} could not be found in the model checkpoint" - print("\n\nERROR: " + error, file=sys.stderr) - raise RuntimeError(error) - finally: - if utils.num_shards is None or utils.current_shard >= utils.num_shards: - utils.bar.close() - utils.bar = None - koboldai_vars.status_message = "" - callback.nested = False - if isinstance(f, zipfile.ZipExtFile): - f.close() + tensor = model_dict[key].materialize(map_location="cpu") + model_dict[key] = tensor.to("meta") + + # MTJ requires certain mathematical operations to be performed + # on tensors in order for them to be in the correct format + if "remove_first_two_rows" in transforms: + tensor = tensor[2:] + if "divide_by_shards" in transforms: + tensor /= params["cores_per_replica"] + if "vocab_pad" in transforms: + tensor = torch.nn.functional.pad(tensor, (0,) * (tensor.ndim * 2 - 1) + (params["n_vocab_padding"],)) + # We don't need to transpose linear module weights anymore because MTJ will do it for us if `transposed_linear` is set to True in the config + #if "no_transpose" not in transforms and tensor.ndim == 2: + # tensor = tensor.T + tensor.unsqueeze_(0) + + + # Shard the tensor so that parts of the tensor can be used + # on different TPU cores + tensor = reshard_reverse( + tensor, + params["cores_per_replica"], + network.state["params"][spec["module"]][spec["param"]].shape, + ) + tensor = tensor.detach() + # numpy does not support bfloat16 + if tensor.dtype is torch.bfloat16: + tensor = tensor.to(torch.float32) + tensor = jnp.array(tensor) + if tensor.dtype is torch.float16 or tensor.dtype is torch.float32: + tensor = tensor.bfloat16() + network.state["params"][spec["module"]][spec["param"]] = move_xmap( + tensor, + np.empty(params["cores_per_replica"]), + ) + + koboldai_vars.loaded_layers += 1 + try: + time.sleep(0.01) + except: + pass + utils.bar.update(1) + + if utils.num_shards is not None and utils.current_shard < utils.num_shards: + return + + # Check for tensors that MTJ needs that were not provided in the + # HF model + for mk, mv in network.state["params"].items(): + for pk, pv in mv.items(): + if isinstance(pv, PlaceholderTensor): + # The transformers GPT-J models apparently do not + # have embedding bias, whereas MTJ GPT-J models do, + # so we have to supplement an embedding bias tensor + # by creating a tensor with the necessary shape, filled + # with zeros. 
+ if mk == "causal_transformer_shard/~/embedding_shard/~/linear" and pk == "b": + mv[pk] = move_xmap(jnp.zeros(mv[pk].shape, dtype=jnp.bfloat16), np.empty(params["cores_per_replica"])) + + else: + error = f"{mk} {pk} could not be found in the model checkpoint" + print("\n\nERROR: " + error, file=sys.stderr) + raise RuntimeError(error) + finally: + if utils.num_shards is None or utils.current_shard >= utils.num_shards: + utils.bar.close() + utils.bar = None + koboldai_vars.status_message = "" + callback.nested = False callback.nested = False if os.path.isdir(koboldai_vars.model.replace('/', '_')): From 59c731f805ca15b8b97ce48315a65d9ec56b56a9 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 3 Jul 2023 18:37:48 -0500 Subject: [PATCH 34/37] Fix static primary_device and some small cleanup --- modeling/inference_models/hf_torch.py | 65 ++++++++++++++------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 2c035e62..9e7710fc 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -42,22 +42,24 @@ from modeling.inference_model import ( # scores for each token. LOG_SAMPLER_NO_EFFECT = False + class BreakmodelConfig: def __init__(self) -> None: self.disk_blocks = 0 self.gpu_blocks = [] - self.primary_device = 0 if torch.cuda.device_count() > 0 else "cpu" + @property + def primary_device(self): + if utils.args.cpu: + return "cpu" + elif not sum(self.gpu_blocks): + # No blocks are on GPU + return "cpu" + elif torch.cuda.device_count() <= 0: + return "cpu" + return 0 def get_device_map(self, model: nn.Module) -> dict: - if ( - # Explicitly CPU-only - utils.args.cpu - # No blocks are on GPU - or not sum(self.gpu_blocks) - ): - self.primary_device = "cpu" - ram_blocks = len(utils.layers_module_names) - sum(self.gpu_blocks) cumulative_gpu_blocks = tuple(itertools.accumulate(self.gpu_blocks)) device_map = {} @@ -171,7 +173,6 @@ class HFTorchInferenceModel(HFInferenceModel): return "Unknown" def _post_load(m_self) -> None: - if not utils.koboldai_vars.model_type: utils.koboldai_vars.model_type = m_self.get_model_type() @@ -329,11 +330,15 @@ class HFTorchInferenceModel(HFInferenceModel): metamodel = AutoModelForCausalLM.from_config(self.model_config) if utils.args.cpu: cpu_map = {name: "cpu" for name in utils.layers_module_names} - for name in utils.get_missing_module_names(metamodel, list(cpu_map.keys())): + for name in utils.get_missing_module_names( + metamodel, list(cpu_map.keys()) + ): cpu_map[name] = "cpu" tf_kwargs["device_map"] = cpu_map else: - tf_kwargs["device_map"] = self.breakmodel_config.get_device_map(metamodel) + tf_kwargs["device_map"] = self.breakmodel_config.get_device_map( + metamodel + ) with lazy_loader.use_lazy_load( enable=self.lazy_load, @@ -434,20 +439,6 @@ class HFTorchInferenceModel(HFInferenceModel): Embedding.__call__ = new_embedding_call Embedding._koboldai_patch_causallm_model = self.model - @contextlib.contextmanager - def _maybe_use_float16(self, always_use: bool = False): - if always_use or ( - utils.koboldai_vars.hascuda - and self.low_mem - and (self.usegpu or self.breakmodel) - ): - original_dtype = torch.get_default_dtype() - torch.set_default_dtype(torch.float16) - yield True - torch.set_default_dtype(original_dtype) - else: - yield False - def breakmodel_device_list(self, n_layers, primary=None, selected=None): device_count = torch.cuda.device_count() if device_count < 2: @@ -484,7 +475,11 @@ class 
HFTorchInferenceModel(HFInferenceModel): def breakmodel_device_config(self, config): n_layers = utils.num_layers(config) - logger.debug("gpu blocks before modification: {}".format(self.breakmodel_config.gpu_blocks)) + logger.debug( + "gpu blocks before modification: {}".format( + self.breakmodel_config.gpu_blocks + ) + ) if utils.args.cpu: self.breakmodel_config.gpu_blocks = [0] * n_layers @@ -510,21 +505,29 @@ class HFTorchInferenceModel(HFInferenceModel): n_layers -= self.breakmodel_config.disk_blocks logger.init_ok("Final device configuration:", status="Info") - self.breakmodel_device_list(n_layers, primary=self.breakmodel_config.primary_device) + self.breakmodel_device_list( + n_layers, primary=self.breakmodel_config.primary_device + ) with open( "settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w" ) as file: file.write( "{}\n{}".format( - ",".join(map(str, self.breakmodel_config.gpu_blocks)), self.breakmodel_config.disk_blocks + ",".join(map(str, self.breakmodel_config.gpu_blocks)), + self.breakmodel_config.disk_blocks, ) ) # If all layers are on the same device, use the old GPU generation mode - while len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[-1] == 0: + while ( + len(self.breakmodel_config.gpu_blocks) + and self.breakmodel_config.gpu_blocks[-1] == 0 + ): self.breakmodel_config.gpu_blocks.pop() self.breakmodel = True - if len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[-1] in ( + if len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[ + -1 + ] in ( -1, utils.num_layers(config), ): From 6f7e6422ef5fd038a561ab28b9c49f8d6289001f Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 3 Jul 2023 19:04:48 -0500 Subject: [PATCH 35/37] Actually get correct primary device --- modeling/inference_models/hf_torch.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 9e7710fc..108887f6 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -57,6 +57,10 @@ class BreakmodelConfig: return "cpu" elif torch.cuda.device_count() <= 0: return "cpu" + + for device_index, blocks in enumerate(self.gpu_blocks): + if blocks: + return device_index return 0 def get_device_map(self, model: nn.Module) -> dict: From bce1a907e57ba331b7379489cacfd1f18c4f60a8 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 3 Jul 2023 19:36:31 -0500 Subject: [PATCH 36/37] Update aux device to depend on primary device --- aiserver.py | 2 +- modeling/inference_model.py | 12 +++++++++++- modeling/inference_models/hf_torch.py | 5 ++++- utils.py | 11 ----------- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/aiserver.py b/aiserver.py index d5dbdfae..05ef8a21 100644 --- a/aiserver.py +++ b/aiserver.py @@ -3777,7 +3777,7 @@ def calcsubmit(txt): bias[i] = b["multiplier"] - device = utils.get_auxilary_device() + device = model.get_auxilary_device() attention_bias.attention_bias = torch.Tensor(bias).to(device) logger.info(f"Bias by {koboldai_vars.memory_attn_bias} -- {attention_bias.attention_bias}") logger.debug("Submit: experimental_features time {}s".format(time.time()-start_time)) diff --git a/modeling/inference_model.py b/modeling/inference_model.py index 491d2b05..bc11ec11 100644 --- a/modeling/inference_model.py +++ b/modeling/inference_model.py @@ -182,6 +182,16 @@ class InferenceModel: setattr(self, parameter, parameters[parameter]) return + def get_auxilary_device(self) -> Union[str, 
int, torch.device]: + """Get device auxilary tensors like inputs should be stored on.""" + + # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU. + if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu: + return utils.koboldai_vars.gpu_device + elif utils.koboldai_vars.hascuda: + return "cuda" + return "cpu" + def load(self, save_model: bool = False, initial_load: bool = False) -> None: """User-facing load function. Do not override this; try `_load()` instead.""" @@ -301,7 +311,7 @@ class InferenceModel: ) start_time = time.time() - gen_in = gen_in.to(utils.get_auxilary_device()) + gen_in = gen_in.to(self.get_auxilary_device()) logger.debug( "core_generate: gen_in to device time {}s".format(time.time() - start_time) diff --git a/modeling/inference_models/hf_torch.py b/modeling/inference_models/hf_torch.py index 108887f6..fb52bac1 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -122,6 +122,9 @@ class HFTorchInferenceModel(HFInferenceModel): return ret + def get_auxilary_device(self) -> Union[str, int, torch.device]: + return self.breakmodel_config.primary_device + def _get_target_dtype(self) -> Union[torch.float16, torch.float32]: if self.breakmodel_config.primary_device == "cpu": return torch.float32 @@ -278,7 +281,7 @@ class HFTorchInferenceModel(HFInferenceModel): if not self.usegpu and not self.breakmodel: gen_in = gen_in.to("cpu") else: - device = utils.get_auxilary_device() + device = self.get_auxilary_device() gen_in = gen_in.to(device) additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else [] diff --git a/utils.py b/utils.py index 863bda2f..e7e20c95 100644 --- a/utils.py +++ b/utils.py @@ -650,17 +650,6 @@ class UIProgressBarFile(object): def flush(self): pass -def get_auxilary_device(): - """Get device auxilary tensors like inputs should be stored on.""" - - # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU. - if koboldai_vars.hascuda and koboldai_vars.usegpu: - return koboldai_vars.gpu_device - elif koboldai_vars.hascuda: - # TODO: Primary device - return "cuda" - return "cpu" - #==================================================================# # Strips submitted text from the text returned by the AI #==================================================================# From ebe478458f7e3052ed8fa020b8accc22c6ee4533 Mon Sep 17 00:00:00 2001 From: somebody Date: Mon, 3 Jul 2023 19:59:53 -0500 Subject: [PATCH 37/37] Just use string sort on key Doesn't seem to affect cache hit number --- modeling/patches.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modeling/patches.py b/modeling/patches.py index d8990327..827e997a 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -196,8 +196,7 @@ def patch_transformers_for_lazyload() -> None: # State dict must be ordered in this manner to make the caching in # lazy_loader.py effective key=lambda x: ( - # NOTE: Assuming key is just decimal - int(x[1].key), + x[1].key, x[1].seek_offset, ), ):
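
The lazy-load behaviour that patches 30-33 converge on is easier to see outside the diff context. The minimal sketch below shows only the core pattern used in use_lazy_load's dematerialized_modules branch: accelerate's init_empty_weights() is entered so that module parameters are created on the meta device (no real memory is allocated), and real tensors are materialized later when the checkpoint is actually read. The model built here is an illustrative stand-in, not KoboldAI's loader.

import accelerate
from torch import nn


def build_skeleton_model() -> nn.Module:
    # Inside init_empty_weights(), newly created parameters land on the "meta"
    # device, so no real CPU/TPU memory is allocated for the weights yet.
    init_empty_weights = accelerate.init_empty_weights()
    init_empty_weights.__enter__()
    try:
        model = nn.Sequential(nn.Linear(4096, 4096), nn.LayerNorm(4096))
    finally:
        init_empty_weights.__exit__(None, None, None)
    return model


model = build_skeleton_model()
print(model[0].weight.device)  # meta -- weights stay placeholders until materialized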
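
Patches 34 and 35 turn the static primary_device attribute into a property. The condensed sketch below mirrors that logic under illustrative names (BreakmodelConfigSketch and force_cpu stand in for the real config object and utils.args.cpu): fall back to CPU when CPU mode is forced, when no layers are assigned to any GPU, or when CUDA is unavailable; otherwise pick the first GPU index that actually holds layers, which is patch 35's fix over always returning device 0.

from typing import List, Union

import torch


class BreakmodelConfigSketch:
    def __init__(self, gpu_blocks: List[int], force_cpu: bool = False) -> None:
        # gpu_blocks[i] is the number of layers assigned to CUDA device i.
        self.gpu_blocks = gpu_blocks
        self.force_cpu = force_cpu

    @property
    def primary_device(self) -> Union[str, int]:
        if self.force_cpu or not sum(self.gpu_blocks) or torch.cuda.device_count() <= 0:
            return "cpu"
        # The primary device is the first GPU that actually holds layers,
        # not unconditionally device 0.
        for device_index, blocks in enumerate(self.gpu_blocks):
            if blocks:
                return device_index
        return 0


print(BreakmodelConfigSketch([0, 0, 24]).primary_device)  # 2 when CUDA is present, else "cpu"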
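
Patch 37 drops the int() conversion in the state-dict ordering used by patch_transformers_for_lazyload. The toy example below, with a hypothetical LazyEntry standing in for the lazy tensor objects, shows what the (key, seek_offset) sort achieves -- tensors stored in the same checkpoint shard are grouped and read in offset order, which is what keeps the chunk cache effective -- and why a plain string sort can reorder shards relative to the numeric sort without hurting cache hits, as the commit message notes.

from dataclasses import dataclass


@dataclass
class LazyEntry:
    key: str          # storage key inside the checkpoint zip
    seek_offset: int  # byte offset of the tensor within that storage


state_dict = {
    "h.1.attn.weight": LazyEntry("10", 4096),
    "h.0.attn.weight": LazyEntry("2", 0),
    "h.0.mlp.weight": LazyEntry("2", 8192),
}

ordered = sorted(state_dict.items(), key=lambda x: (x[1].key, x[1].seek_offset))
print([name for name, _ in ordered])
# ['h.1.attn.weight', 'h.0.attn.weight', 'h.0.mlp.weight'] -- "10" < "2" as strings,
# but entries sharing a storage key are still read sequentially by offset.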