diff --git a/aiserver.py b/aiserver.py index 75cba06f..49223b3a 100644 --- a/aiserver.py +++ b/aiserver.py @@ -1525,6 +1525,7 @@ def general_startup(override_args=None): print(f"Allowed IPs: {allowed_ips}") if args.cpu: + os.environ['CUDA_VISIBLE_DEVICES'] = "None" koboldai_vars.use_colab_tpu = False koboldai_vars.hascuda = False koboldai_vars.usegpu = False @@ -10811,7 +10812,7 @@ def run(): Session(app) logger.init_ok("Flask", status="OK") logger.init("Webserver", status="Starting") - patch_transformers() + patch_transformers(use_tpu=koboldai_vars.use_colab_tpu) # Start Flask/SocketIO (Blocking, so this must be last method!) port = args.port if "port" in args and args.port is not None else 5000 @@ -10908,7 +10909,7 @@ else: logger.init("Flask", status="Starting") Session(app) logger.init_ok("Flask", status="OK") - patch_transformers() + patch_transformers(use_tpu=koboldai_vars.use_colab_tpu) startup(command_line_backend) koboldai_settings.port = args.port if "port" in args and args.port is not None else 5000 print("{0}\nServer started in WSGI mode!{1}".format(colors.GREEN, colors.END), flush=True) diff --git a/breakmodel.py b/breakmodel.py deleted file mode 100644 index 75bc03cc..00000000 --- a/breakmodel.py +++ /dev/null @@ -1,955 +0,0 @@ -''' -This is a MODIFIED version of arrmansa's low VRAM patch. -https://github.com/arrmansa/Basic-UI-for-GPT-J-6B-with-low-vram/blob/main/GPT-J-6B-Low-Vram-UI.ipynb -The ORIGINAL version of the patch is released under the Apache License 2.0 -Copyright 2021 arrmansa -Copyright 2021 finetuneanon -Copyright 2018, 2022 The Hugging Face team - - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. 
For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-''' - - -import torch -from torch import nn -import torch.cuda.comm -import copy -import gc -import os -import sys -import itertools -import bisect -import random -import utils -from typing import Dict, List, Optional, Union - -from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPastAndCrossAttentions - -from transformers.utils import logging -logger = logging.get_logger(__name__) - - -breakmodel = True -gpu_blocks = [] -disk_blocks = 0 -primary_device = 0 if torch.cuda.device_count() > 0 else "cpu" - -from accelerate.hooks import attach_align_device_hook_on_blocks -from accelerate.utils import OffloadedWeightsLoader, check_device_map, extract_submodules_state_dict, offload_state_dict -from accelerate import dispatch_model - -def dispatch_model_ex( - model: nn.Module, - device_map: Dict[str, Union[str, int, torch.device]], - main_device: Optional[torch.device] = None, - state_dict: Optional[Dict[str, torch.Tensor]] = None, - offload_dir: Union[str, os.PathLike] = None, - offload_buffers: bool = False, - **kwargs, -): - """ - This is a modified version of - https://github.com/huggingface/accelerate/blob/eeaba598f455fbd2c48661d7e816d3ff25ab050b/src/accelerate/big_modeling.py#L130 - that still works when the main device is the CPU. - - Dispatches a model according to a given device map. Layers of the model might be spread across GPUs, offloaded on - the CPU or even the disk. - - Args: - model (`torch.nn.Module`): - The model to dispatch. - device_map (`Dict[str, Union[str, int, torch.device]]`): - A dictionary mapping module names in the models `state_dict` to the device they should go to. Note that - `"disk"` is accepted even if it's not a proper value for `torch.device`. - main_device (`str`, `int` or `torch.device`, *optional*): - The main execution device. Will default to the first device in the `device_map` different from `"cpu"` or - `"disk"`. - state_dict (`Dict[str, torch.Tensor]`, *optional*): - The state dict of the part of the model that will be kept on CPU. - offload_dir (`str` or `os.PathLike`): - The folder in which to offload the model weights (or where the model weights are already offloaded). - offload_buffers (`bool`, *optional*, defaults to `False`): - Whether or not to offload the buffers with the model parameters. - preload_module_classes (`List[str]`, *optional*): - A list of classes whose instances should load all their weights (even in the submodules) at the beginning - of the forward. This should only be used for classes that have submodules which are registered but not - called directly during the forward, for instance if a `dense` linear layer is registered, but at forward, - `dense.weight` and `dense.bias` are used in some operations instead of calling `dense` directly. - """ - if main_device != "cpu": - return dispatch_model(model, device_map, main_device, state_dict, offload_dir=offload_dir, offload_buffers=offload_buffers, **kwargs) - - # Error early if the device map is incomplete. 
- check_device_map(model, device_map) - - offload_devices = ["cpu", "disk"] if main_device != "cpu" else ["disk"] - - if main_device is None: - main_device = [d for d in device_map.values() if d not in offload_devices][0] - - cpu_modules = [name for name, device in device_map.items() if device == "cpu"] if main_device != "cpu" else [] - if state_dict is None and len(cpu_modules) > 0: - state_dict = extract_submodules_state_dict(model.state_dict(), cpu_modules) - - disk_modules = [name for name, device in device_map.items() if device == "disk"] - if offload_dir is None and len(disk_modules) > 0: - raise ValueError( - "We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules " - f"need to be offloaded: {', '.join(disk_modules)}." - ) - if len(disk_modules) > 0 and ( - not os.path.isdir(offload_dir) or not os.path.isfile(os.path.join(offload_dir, "index.json")) - ): - disk_state_dict = extract_submodules_state_dict(model.state_dict(), disk_modules) - offload_state_dict(offload_dir, disk_state_dict) - - execution_device = { - name: main_device if device in offload_devices else device for name, device in device_map.items() - } - offload = {name: device in offload_devices for name, device in device_map.items()} - save_folder = offload_dir if len(disk_modules) > 0 else None - if state_dict is not None or save_folder is not None: - weights_map = OffloadedWeightsLoader(state_dict=state_dict, save_folder=save_folder) - else: - weights_map = None - - attach_align_device_hook_on_blocks( - model, - execution_device=execution_device, - offload=offload, - offload_buffers=offload_buffers, - weights_map=weights_map, - **kwargs, - ) - model.hf_device_map = device_map - return model - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) - - -def move_hidden_layers(transformer, h=None): - if h is None: - h = transformer.h - - assert len(gpu_blocks) <= torch.cuda.device_count() - assert sum(gpu_blocks) <= len(h) - ram_blocks = len(h) - sum(gpu_blocks) - - transformer.extrastorage = {} - torch.cuda.empty_cache() - - able_to_pin_layers = True - for i in range(ram_blocks): - h[i].to("cpu") - transformer.extrastorage[i] = copy.deepcopy(h[i]) - smalltensor = torch.tensor(0).to(primary_device) - for param1 in h[i].parameters(): - param1.data = smalltensor - h[i].to(primary_device) - for param in transformer.extrastorage[i].parameters(): - param.requires_grad = False - param.data = param.data.detach() - if able_to_pin_layers: - try: - param.data = param.data.pin_memory() - except: - able_to_pin_layers = False - print(f"WARNING: You only have enough shared GPU memory for {i} out of {ram_blocks} CPU layers. 
Expect suboptimal speed.", file=sys.stderr) - gc.collect() - torch.cuda.empty_cache() - - if ram_blocks: - for param1,param2 in zip(h[0].parameters(),transformer.extrastorage[0].parameters()): - param1.data = param2.data.to(primary_device, non_blocking=False).detach() - - for param1,param2 in zip(h[ram_blocks-1].parameters(),transformer.extrastorage[ram_blocks-1].parameters()): - param1.data = param2.data.to(primary_device, non_blocking=False).detach() - - i = ram_blocks - for j in range(len(gpu_blocks)): - for _ in range(gpu_blocks[j]): - h[i].to(j) - i += 1 - - -def new_forward_neo( - self, - input_ids=None, - past_key_values=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - embs=None, -): - assert len(gpu_blocks) <= torch.cuda.device_count() - assert sum(gpu_blocks) <= len(self.h) - ram_blocks = len(self.h) - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - if position_ids is not None: - position_ids = position_ids.view(-1, input_shape[-1]) - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - past_length = past_key_values[0][0].size(-2) - - device = primary_device if breakmodel else input_ids.device if input_ids is not None else inputs_embeds.device - if position_ids is None: - position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) - - # Attention mask. - if attention_mask is not None: - assert batch_size > 0, "batch_size has to be defined and > 0" - attention_mask = attention_mask.view(batch_size, -1) - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - attention_mask = attention_mask[:, None, None, :] - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. 
- # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * -10000.0 - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x num_heads x N x N - # head_mask has shape n_layer x batch x num_heads x N x N - head_mask = self.get_head_mask(head_mask, getattr(self.config, "num_layers", None) or self.config.n_layer) - - if inputs_embeds is None: - if breakmodel: - input_ids = input_ids.to(primary_device) - inputs_embeds = self.wte(input_ids) - - if embs is not None and not (use_cache is not None and use_cache and past_key_values is not None and len(past_key_values) > 0 and past_key_values[0] is not None): - offset = 0 - for pos, emb in embs: - pos += offset - if len(emb.shape) == 2: - emb = emb.repeat(input_shape[0], 1, 1) - inputs_embeds[:, pos:pos+emb.shape[1]] = emb - offset += emb.shape[1] - - if getattr(self, "wpe", None) is None: - hidden_states = inputs_embeds - else: - if breakmodel: - position_ids = position_ids.to(primary_device) - position_embeds = self.wpe(position_ids) - if breakmodel: - position_embeds = position_embeds.to(primary_device) - hidden_states = inputs_embeds + position_embeds - - if token_type_ids is not None: - token_type_embeds = self.wte(token_type_ids) - hidden_states = hidden_states + token_type_embeds - - hidden_states = self.drop(hidden_states) - - output_shape = input_shape + (hidden_states.size(-1),) - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - - if breakmodel and ram_blocks: - copystream = torch.cuda.Stream(device=primary_device, priority=-1) - - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - - if breakmodel: - if i in range(ram_blocks): - index1 = (i+1)%ram_blocks - for param1,param2 in zip(self.h[index1].parameters(),self.h[(i-1)%ram_blocks].parameters()): - param1.data = param2.data - for param1,param2 in zip(self.h[index1].parameters(),self.extrastorage[index1].parameters()): - with torch.cuda.stream(copystream): - torch.cuda.comm.broadcast(param2.data,out = [param1.data]) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states.cpu(),) - - if getattr(self.config, "gradient_checkpointing", False) and self.training: - - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, use_cache, output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - None, - attention_mask, - head_mask[i], - ) - else: - if breakmodel: - device = primary_device if i < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, i - ram_blocks) - outputs = block( - hidden_states.to(device) if breakmodel and hidden_states is not None else hidden_states, - layer_past=tuple(v.to(device) for v in layer_past if v is not None) if breakmodel and layer_past is not None and i >= ram_blocks and len(layer_past) and layer_past[0].device.index != device else layer_past, - attention_mask=attention_mask.to(device) if breakmodel and attention_mask is not None else attention_mask, - head_mask=head_mask[i].to(device) if breakmodel and head_mask[i] is not None else head_mask[i], - use_cache=use_cache, - output_attentions=output_attentions, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - - - if breakmodel: - if i in range(ram_blocks): - torch.cuda.synchronize() - torch.cuda.empty_cache() - - if breakmodel: - if ram_blocks: - del copystream - torch.cuda.empty_cache() - hidden_states = hidden_states.to(primary_device) - hidden_states = self.ln_f(hidden_states) - if breakmodel: - hidden_states = hidden_states.to(primary_device) - - hidden_states = hidden_states.view(*output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -def new_forward_xglm( - self, - input_ids=None, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - head_mask=None, - cross_attn_head_mask=None, - past_key_values=None, - inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, -): - assert len(gpu_blocks) <= torch.cuda.device_count() - assert sum(gpu_blocks) <= len(self.layers) - ram_blocks = len(self.layers) - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - # past_key_values_length - past_key_values_length = 
past_key_values[0][0].shape[2] if past_key_values is not None else 0 - - if inputs_embeds is None: - if breakmodel: - input_ids = input_ids.to(primary_device) - inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) - - # expand encoder attention mask - if encoder_hidden_states is not None and encoder_attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) - - # embed positions - if breakmodel: - inputs_embeds = inputs_embeds.to(primary_device) - positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length) - if breakmodel: - positions = positions.to(primary_device) - - hidden_states = inputs_embeds + positions - - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None - next_decoder_cache = () if use_cache else None - - if breakmodel and ram_blocks: - copystream = torch.cuda.Stream(device=primary_device, priority=-1) - - # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired - for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): - if attn_mask is not None: - assert attn_mask.size()[0] == ( - len(self.layers) - ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." - for idx, decoder_layer in enumerate(self.layers): - i = idx - if breakmodel: - if i in range(ram_blocks): - index1 = (i+1)%ram_blocks - for param1,param2 in zip(self.layers[index1].parameters(),self.layers[(i-1)%ram_blocks].parameters()): - param1.data = param2.data - for param1,param2 in zip(self.layers[index1].parameters(),self.extrastorage[index1].parameters()): - with torch.cuda.stream(copystream): - torch.cuda.comm.broadcast(param2.data,out = [param1.data]) - - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if output_hidden_states: - all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) - if self.training and (dropout_probability < self.layerdrop): - continue - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - "`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`..." 
- ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, use_cache) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - encoder_hidden_states, - encoder_attention_mask, - head_mask[idx] if head_mask is not None else None, - cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, - None, - ) - else: - if breakmodel: - device = primary_device if i < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, i - ram_blocks) - layer_outputs = decoder_layer( - hidden_states.to(device) if breakmodel and hidden_states is not None else hidden_states, - attention_mask=attention_mask.to(device) if breakmodel and attention_mask is not None else attention_mask, - encoder_hidden_states=encoder_hidden_states.to(device) if breakmodel and encoder_hidden_states is not None else encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask.to(device) if breakmodel and encoder_attention_mask is not None else encoder_attention_mask, - layer_head_mask=((head_mask[idx].to(device) if breakmodel and head_mask[idx] is not None else head_mask[idx]) if head_mask is not None else None), - cross_attn_layer_head_mask=( - (cross_attn_head_mask[idx].to(device) if breakmodel and cross_attn_head_mask[idx] is not None else cross_attn_head_mask[idx]) if cross_attn_head_mask is not None else None - ), - past_key_value=tuple(v.to(device) for v in past_key_value if v is not None) if breakmodel and past_key_value is not None and i >= ram_blocks and len(past_key_value) and past_key_value[0].device.index != device else past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - if encoder_hidden_states is not None: - all_cross_attentions += (layer_outputs[2],) - - if breakmodel: - if i in range(ram_blocks): - torch.cuda.synchronize() - torch.cuda.empty_cache() - - if breakmodel: - if ram_blocks: - del copystream - torch.cuda.empty_cache() - hidden_states = hidden_states.to(primary_device) - hidden_states = self.layer_norm(hidden_states) - if breakmodel: - hidden_states = hidden_states.to(primary_device) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v - for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - cross_attentions=all_cross_attentions, - ) - - -def new_forward_opt( - self, - input_ids=None, - attention_mask=None, - head_mask=None, - past_key_values=None, - inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, -): - assert len(gpu_blocks) <= torch.cuda.device_count() - assert sum(gpu_blocks) <= len(self.layers) - ram_blocks = len(self.layers) - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - - - output_attentions = output_attentions if output_attentions is not None else 
self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - - if inputs_embeds is None: - if breakmodel: - input_ids = input_ids.to(primary_device) - inputs_embeds = self.embed_tokens(input_ids) - - # embed positions - if breakmodel: - inputs_embeds = inputs_embeds.to(primary_device) - if attention_mask is None: - attention_mask = torch.ones(inputs_embeds.shape[:2], dtype=torch.bool, device=inputs_embeds.device) - - positions = self.embed_positions(attention_mask)[:, past_key_values_length:, :] - if breakmodel: - positions = positions.to(primary_device) - - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) - - if self.project_in is not None: - inputs_embeds = self.project_in(inputs_embeds) - - hidden_states = inputs_embeds + positions - - hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - if breakmodel and ram_blocks: - copystream = torch.cuda.Stream(device=primary_device, priority=-1) - - # check if head_mask has a correct number of layers specified if desired - for attn_mask, mask_name in zip([head_mask], ["head_mask"]): - if attn_mask is not None: - if attn_mask.size()[0] != (len(self.layers)): - raise ValueError( - f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" - f" {head_mask.size()[0]}." - ) - - for idx, decoder_layer in enumerate(self.layers): - i = idx - if breakmodel: - if i in range(ram_blocks): - index1 = (i+1)%ram_blocks - for param1,param2 in zip(self.layers[index1].parameters(),self.layers[(i-1)%ram_blocks].parameters()): - param1.data = param2.data - for param1,param2 in zip(self.layers[index1].parameters(),self.extrastorage[index1].parameters()): - with torch.cuda.stream(copystream): - torch.cuda.comm.broadcast(param2.data,out = [param1.data]) - - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if output_hidden_states: - all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) - if self.training and (dropout_probability < self.layerdrop): - continue - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - head_mask[idx] if head_mask is not None else None, - None, - ) - else: - if breakmodel: - device = primary_device if i < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, i - ram_blocks) - layer_outputs = decoder_layer( - hidden_states.to(device) if breakmodel and hidden_states is not None else hidden_states, - attention_mask=attention_mask.to(device) if breakmodel and attention_mask is not None else attention_mask, - layer_head_mask=((head_mask[idx].to(device) if breakmodel and head_mask[idx] is not None else head_mask[idx]) if head_mask is not None else None), - past_key_value=tuple(v.to(device) for v in past_key_value if v is not None) if breakmodel and past_key_value is not None and i >= ram_blocks and len(past_key_value) and past_key_value[0].device.index != device else past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - if breakmodel: - if i in range(ram_blocks): - torch.cuda.synchronize() - torch.cuda.empty_cache() - - if breakmodel: - if ram_blocks: - del copystream - torch.cuda.empty_cache() - hidden_states = hidden_states.to(primary_device) - if self.project_out is not None: - hidden_states = self.project_out(hidden_states) - if breakmodel: - hidden_states = hidden_states.to(primary_device) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) diff --git a/modeling/inference_model.py b/modeling/inference_model.py index 491d2b05..bc11ec11 100644 --- a/modeling/inference_model.py +++ b/modeling/inference_model.py @@ -182,6 +182,16 @@ class InferenceModel: setattr(self, parameter, parameters[parameter]) return + def get_auxilary_device(self) -> Union[str, int, torch.device]: + """Get device auxilary tensors like inputs should be stored on.""" + + # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU. + if utils.koboldai_vars.hascuda and utils.koboldai_vars.usegpu: + return utils.koboldai_vars.gpu_device + elif utils.koboldai_vars.hascuda: + return "cuda" + return "cpu" + def load(self, save_model: bool = False, initial_load: bool = False) -> None: """User-facing load function. 
Do not override this; try `_load()` instead.""" @@ -301,7 +311,7 @@ class InferenceModel: ) start_time = time.time() - gen_in = gen_in.to(utils.get_auxilary_device()) + gen_in = gen_in.to(self.get_auxilary_device()) logger.debug( "core_generate: gen_in to device time {}s".format(time.time() - start_time) diff --git a/modeling/inference_models/generic_hf_torch/class.py b/modeling/inference_models/generic_hf_torch/class.py index d900bb95..ad17b85b 100644 --- a/modeling/inference_models/generic_hf_torch/class.py +++ b/modeling/inference_models/generic_hf_torch/class.py @@ -13,12 +13,6 @@ import modeling.lazy_loader as lazy_loader import koboldai_settings from logger import logger -try: - import breakmodel -except ModuleNotFoundError as e: - # Breakmodel is only expected to work on GPU - if not utils.koboldai_vars.use_colab_tpu: - raise e from modeling.inference_models.hf_torch import HFTorchInferenceModel @@ -26,10 +20,9 @@ model_backend_name = "Huggingface" model_backend_type = "Huggingface" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face) class model_backend(HFTorchInferenceModel): - def _initialize_model(self): return - + def _load(self, save_model: bool, initial_load: bool) -> None: utils.koboldai_vars.allowsp = True @@ -41,9 +34,7 @@ class model_backend(HFTorchInferenceModel): # utils.koboldai_vars.custmodpth = utils.koboldai_vars.model if self.model_name == "NeoCustom": - self.model_name = os.path.basename( - os.path.normpath(self.path) - ) + self.model_name = os.path.basename(os.path.normpath(self.path)) utils.koboldai_vars.model = self.model_name # If we specify a model and it's in the root directory, we need to move @@ -68,23 +59,31 @@ class model_backend(HFTorchInferenceModel): # Also, lazy loader doesn't support GPT-2 models self.lazy_load = False + logger.debug( + "lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format( + self.lazy_load, + utils.koboldai_vars.hascuda, + self.breakmodel, + self.nobreakmodel, + ) + ) + # If we're using torch_lazy_loader, we need to get breakmodel config # early so that it knows where to load the individual model tensors - logger.debug("lazy_load: {} hascuda: {} breakmodel: {} nobreakmode: {}".format(self.lazy_load, utils.koboldai_vars.hascuda, self.breakmodel, self.nobreakmodel)) if ( self.lazy_load and utils.koboldai_vars.hascuda - and self.breakmodel - and not self.nobreakmodel + and utils.koboldai_vars.breakmodel + and not utils.koboldai_vars.nobreakmodel ): - logger.debug("loading breakmodel") self.breakmodel_device_config(self.model_config) if self.lazy_load: + # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time + tf_kwargs.pop("low_cpu_mem_usage", None) + # If we're using lazy loader, we need to figure out what the model's hidden layers are called - with lazy_loader.use_lazy_load( - dematerialized_modules=True, use_accelerate_init_empty_weights=True - ): + with lazy_loader.use_lazy_load(dematerialized_modules=True): try: metamodel = AutoModelForCausalLM.from_config(self.model_config) utils.layers_module_names = utils.get_layers_module_names(metamodel) @@ -95,143 +94,92 @@ class model_backend(HFTorchInferenceModel): self.lazy_load = False # Download model from Huggingface if it does not exist, otherwise load locally - with self._maybe_use_float16(), lazy_loader.use_lazy_load( - enable=self.lazy_load, - callback=self._get_lazy_load_callback(utils.num_layers(self.model_config)) - if self.lazy_load - else None, - 
dematerialized_modules=True, - ): - if self.lazy_load: - # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - tf_kwargs.pop("low_cpu_mem_usage", None) + if self.get_local_model_path(): + # Model is stored locally, load it. + self.model = self._get_model(self.get_local_model_path(), tf_kwargs) + self.tokenizer = self._get_tokenizer(self.get_local_model_path()) + else: + # Model not stored locally, we need to download it. - if self.get_local_model_path(): - # Model is stored locally, load it. - self.model = self._get_model(self.get_local_model_path(), tf_kwargs) - self.tokenizer = self._get_tokenizer(self.get_local_model_path()) - else: - # Model not stored locally, we need to download it. + # _rebuild_tensor patch for casting dtype and supporting LazyTensors + old_rebuild_tensor = torch._utils._rebuild_tensor - # _rebuild_tensor patch for casting dtype and supporting LazyTensors - old_rebuild_tensor = torch._utils._rebuild_tensor + def new_rebuild_tensor( + storage: Union[lazy_loader.LazyTensor, torch.Storage], + storage_offset, + shape, + stride, + ): + if not isinstance(storage, lazy_loader.LazyTensor): + dtype = storage.dtype + else: + dtype = storage.storage_type.dtype + if not isinstance(dtype, torch.dtype): + dtype = storage.storage_type(0).dtype + if dtype is torch.float32 and len(shape) >= 2: + utils.koboldai_vars.fp32_model = True + return old_rebuild_tensor(storage, storage_offset, shape, stride) - def new_rebuild_tensor( - storage: Union[lazy_loader.LazyTensor, torch.Storage], - storage_offset, - shape, - stride, - ): - if not isinstance(storage, lazy_loader.LazyTensor): - dtype = storage.dtype - else: - dtype = storage.storage_type.dtype - if not isinstance(dtype, torch.dtype): - dtype = storage.storage_type(0).dtype - if dtype is torch.float32 and len(shape) >= 2: - utils.koboldai_vars.fp32_model = True - return old_rebuild_tensor(storage, storage_offset, shape, stride) + torch._utils._rebuild_tensor = new_rebuild_tensor + self.model = self._get_model(self.model_name, tf_kwargs) + self.tokenizer = self._get_tokenizer(self.model_name) + torch._utils._rebuild_tensor = old_rebuild_tensor - torch._utils._rebuild_tensor = new_rebuild_tensor - self.model = self._get_model(self.model_name, tf_kwargs) - self.tokenizer = self._get_tokenizer(self.model_name) - torch._utils._rebuild_tensor = old_rebuild_tensor + if save_model: + self.tokenizer.save_pretrained( + self.get_local_model_path(ignore_existance=True) + ) - if save_model: - self.tokenizer.save_pretrained( - self.get_local_model_path(ignore_existance=True) + if utils.koboldai_vars.fp32_model: + # Use save_pretrained to convert fp32 models to fp16, + # unless we are using disk cache because save_pretrained + # is not supported in that case + self.model = self.model.half() + self.model.save_pretrained( + self.get_local_model_path(ignore_existance=True), + max_shard_size="500MiB", ) - if utils.koboldai_vars.fp32_model and not breakmodel.disk_blocks: - # Use save_pretrained to convert fp32 models to fp16, - # unless we are using disk cache because save_pretrained - # is not supported in that case - self.model = self.model.half() - self.model.save_pretrained( - self.get_local_model_path(ignore_existance=True), - max_shard_size="500MiB", - ) + else: + # For fp16 models, we can just copy the model files directly + import transformers.configuration_utils + import transformers.modeling_utils + import transformers.file_utils + import huggingface_hub - else: - # For fp16 models, we can just copy the model files 
directly - import transformers.configuration_utils - import transformers.modeling_utils - import transformers.file_utils - import huggingface_hub - - # Save the config.json - shutil.move( - os.path.realpath( - huggingface_hub.hf_hub_download( - self.model_name, - transformers.configuration_utils.CONFIG_NAME, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=False, - ) - ), - os.path.join( - self.get_local_model_path(ignore_existance=True), + # Save the config.json + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + self.model_name, transformers.configuration_utils.CONFIG_NAME, - ), - ) - - if utils.num_shards is None: - # Save the pytorch_model.bin or model.safetensors of an unsharded model - any_success = False - possible_checkpoint_names = [ - transformers.modeling_utils.WEIGHTS_NAME, - "model.safetensors", - ] - - for possible_checkpoint_name in possible_checkpoint_names: - try: - shutil.move( - os.path.realpath( - huggingface_hub.hf_hub_download( - self.model_name, - possible_checkpoint_name, - revision=utils.koboldai_vars.revision, - cache_dir="cache", - local_files_only=True, - legacy_cache_layout=False, - ) - ), - os.path.join( - self.get_local_model_path( - ignore_existance=True - ), - possible_checkpoint_name, - ), - ) - any_success = True - except Exception: - pass - - if not any_success: - raise RuntimeError(f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revisison}'") - else: - # Handle saving sharded models - - with open(utils.from_pretrained_index_filename) as f: - map_data = json.load(f) - filenames = set(map_data["weight_map"].values()) - # Save the pytorch_model.bin.index.json of a sharded model - shutil.move( - os.path.realpath(utils.from_pretrained_index_filename), - os.path.join( - self.get_local_model_path(ignore_existance=True), - transformers.modeling_utils.WEIGHTS_INDEX_NAME, - ), + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, ) - # Then save the pytorch_model-#####-of-#####.bin files - for filename in filenames: + ), + os.path.join( + self.get_local_model_path(ignore_existance=True), + transformers.configuration_utils.CONFIG_NAME, + ), + ) + + if utils.num_shards is None: + # Save the pytorch_model.bin or model.safetensors of an unsharded model + any_success = False + possible_checkpoint_names = [ + transformers.modeling_utils.WEIGHTS_NAME, + "model.safetensors", + ] + + for possible_checkpoint_name in possible_checkpoint_names: + try: shutil.move( os.path.realpath( huggingface_hub.hf_hub_download( self.model_name, - filename, + possible_checkpoint_name, revision=utils.koboldai_vars.revision, cache_dir="cache", local_files_only=True, @@ -242,38 +190,70 @@ class model_backend(HFTorchInferenceModel): self.get_local_model_path( ignore_existance=True ), - filename, + possible_checkpoint_name, ), ) - shutil.rmtree("cache/") + any_success = True + except Exception: + pass + + if not any_success: + raise RuntimeError( + f"Couldn't find any of {possible_checkpoint_names} in cache for {self.model_name} @ '{utils.koboldai_vars.revisison}'" + ) + else: + # Handle saving sharded models + + with open(utils.from_pretrained_index_filename) as f: + map_data = json.load(f) + filenames = set(map_data["weight_map"].values()) + # Save the pytorch_model.bin.index.json of a sharded model + shutil.move( + os.path.realpath(utils.from_pretrained_index_filename), + os.path.join( + 
self.get_local_model_path(ignore_existance=True), + transformers.modeling_utils.WEIGHTS_INDEX_NAME, + ), + ) + # Then save the pytorch_model-#####-of-#####.bin files + for filename in filenames: + shutil.move( + os.path.realpath( + huggingface_hub.hf_hub_download( + self.model_name, + filename, + revision=utils.koboldai_vars.revision, + cache_dir="cache", + local_files_only=True, + legacy_cache_layout=False, + ) + ), + os.path.join( + self.get_local_model_path(ignore_existance=True), + filename, + ), + ) + shutil.rmtree("cache/") self.patch_embedding() - - if utils.koboldai_vars.hascuda: - if self.usegpu or self.nobreakmodel: - # Use just VRAM - self.model = self.model.half().to(utils.koboldai_vars.gpu_device) - elif self.breakmodel: - # Use both RAM and VRAM (breakmodel) - if not self.lazy_load: - self.breakmodel_device_config(self.model.config) - self._move_to_devices() - elif breakmodel.disk_blocks > 0: - # Use disk - self._move_to_devices() - else: - # Use CPU - self.model = self.model.to("cpu").float() - elif breakmodel.disk_blocks > 0: - self._move_to_devices() - else: - self.model = self.model.to("cpu").float() - - self.model.kai_model = self utils.koboldai_vars.modeldim = self.get_hidden_size() def _save_settings(self): - with open("settings/{}.generic_hf_torch.model_backend.settings".format(self.model_name.replace("/", "_")), "w") as f: - json.dump({"layers": self.layers if 'layers' in vars(self) else [], "disk_layers": self.disk_layers if 'disk_layers' in vars(self) else 0}, f, indent="") \ No newline at end of file + with open( + "settings/{}.generic_hf_torch.model_backend.settings".format( + self.model_name.replace("/", "_") + ), + "w", + ) as f: + json.dump( + { + "layers": self.layers if "layers" in vars(self) else [], + "disk_layers": self.disk_layers + if "disk_layers" in vars(self) + else 0, + }, + f, + indent="", + ) diff --git a/modeling/inference_models/hf.py b/modeling/inference_models/hf.py index 4226d1b1..881ca604 100644 --- a/modeling/inference_models/hf.py +++ b/modeling/inference_models/hf.py @@ -157,7 +157,6 @@ class HFInferenceModel(InferenceModel): def set_input_parameters(self, parameters): if self.hf_torch and hasattr(self, "get_model_type") and self.get_model_type() != "gpt2": - import breakmodel layer_count = self.model_config["n_layer"] if isinstance(self.model_config, dict) else self.model_config.num_layers if hasattr(self.model_config, "num_layers") else self.model_config.n_layer if hasattr(self.model_config, "n_layer") else self.model_config.num_hidden_layers if hasattr(self.model_config, 'num_hidden_layers') else None if layer_count is not None and layer_count >= 0 and not self.nobreakmodel: gpu_count = torch.cuda.device_count() @@ -176,9 +175,8 @@ class HFInferenceModel(InferenceModel): self.disk_layers = parameters['Disk_Layers'] if 'Disk_Layers' in parameters else 0 if isinstance(self.disk_layers, str): self.disk_layers = int(self.disk_layers) if self.disk_layers.isnumeric() else 0 - breakmodel.gpu_blocks = layers - breakmodel.disk_blocks = self.disk_layers - self.usegpu = self.cpu_layers == 0 and breakmodel.disk_blocks == 0 and sum(self.layers)-self.layers[0] == 0 + print("TODO: Allow config") + # self.usegpu = self.cpu_layers == 0 and breakmodel.disk_blocks == 0 and sum(self.layers)-self.layers[0] == 0 self.model_type = self.get_model_type() self.breakmodel = ((self.model_type != 'gpt2') or self.model_type in ("gpt_neo", "gptj", "xglm", "opt")) and not self.nobreakmodel self.lazy_load = True diff --git a/modeling/inference_models/hf_torch.py 
b/modeling/inference_models/hf_torch.py index 2f575e73..fb52bac1 100644 --- a/modeling/inference_models/hf_torch.py +++ b/modeling/inference_models/hf_torch.py @@ -1,15 +1,13 @@ from __future__ import annotations +from dataclasses import dataclass -import gc import os import time import bisect -import zipfile -import functools import itertools import traceback import contextlib -from tqdm.auto import tqdm +from torch import nn from typing import Dict, List, Optional, Union import torch @@ -39,19 +37,52 @@ from modeling.inference_model import ( use_core_manipulations, ) -try: - import breakmodel - import accelerate.utils -except ModuleNotFoundError as e: - if not utils.koboldai_vars.use_colab_tpu: - raise e - # When set to true, messages will appear in the console if samplers are not # changing the scores. Keep in mind some samplers don't always change the # scores for each token. LOG_SAMPLER_NO_EFFECT = False +class BreakmodelConfig: + def __init__(self) -> None: + self.disk_blocks = 0 + self.gpu_blocks = [] + + @property + def primary_device(self): + if utils.args.cpu: + return "cpu" + elif not sum(self.gpu_blocks): + # No blocks are on GPU + return "cpu" + elif torch.cuda.device_count() <= 0: + return "cpu" + + for device_index, blocks in enumerate(self.gpu_blocks): + if blocks: + return device_index + return 0 + + def get_device_map(self, model: nn.Module) -> dict: + ram_blocks = len(utils.layers_module_names) - sum(self.gpu_blocks) + cumulative_gpu_blocks = tuple(itertools.accumulate(self.gpu_blocks)) + device_map = {} + + for name in utils.layers_module_names: + layer = int(name.rsplit(".", 1)[1]) + device = ( + ("disk" if layer < self.disk_blocks else "cpu") + if layer < ram_blocks + else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) + ) + device_map[name] = device + + for name in utils.get_missing_module_names(model, list(device_map.keys())): + device_map[name] = self.primary_device + + return device_map + + class HFTorchInferenceModel(HFInferenceModel): def __init__(self) -> None: super().__init__() @@ -79,6 +110,29 @@ class HFTorchInferenceModel(HFInferenceModel): post_token_probs=True, ) self._old_stopping_criteria = None + self.breakmodel_config = BreakmodelConfig() + + def set_input_parameters(self, parameters): + ret = super().set_input_parameters(parameters) + + # Hook onto input param setting for setting breakmodel stuff + if self.breakmodel: + self.breakmodel_config.gpu_blocks = self.layers + self.breakmodel_config.disk_blocks = self.disk_layers + + return ret + + def get_auxilary_device(self) -> Union[str, int, torch.device]: + return self.breakmodel_config.primary_device + + def _get_target_dtype(self) -> Union[torch.float16, torch.float32]: + if self.breakmodel_config.primary_device == "cpu": + return torch.float32 + elif utils.args.cpu: + return torch.float32 + elif not self.usegpu and not self.breakmodel: + return torch.float32 + return torch.float16 def _apply_warpers( self, scores: torch.Tensor, input_ids: torch.Tensor @@ -125,19 +179,7 @@ class HFTorchInferenceModel(HFInferenceModel): else: return "Unknown" - def get_auxilary_device(self): - """Get device auxilary tensors like inputs should be stored on.""" - - # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU. 
- if utils.koboldai_vars.hascuda and self.usegpu: - return utils.koboldai_vars.gpu_device - elif utils.koboldai_vars.hascuda and self.breakmodel: - import breakmodel - return breakmodel.primary_device - return "cpu" - def _post_load(m_self) -> None: - if not utils.koboldai_vars.model_type: utils.koboldai_vars.model_type = m_self.get_model_type() @@ -236,9 +278,11 @@ class HFTorchInferenceModel(HFInferenceModel): gen_in = torch.tensor(prompt_tokens, dtype=torch.long)[None] else: gen_in = prompt_tokens - - device = self.get_auxilary_device() - gen_in = gen_in.to(device) + if not self.usegpu and not self.breakmodel: + gen_in = gen_in.to("cpu") + else: + device = self.get_auxilary_device() + gen_in = gen_in.to(device) additional_bad_words_ids = [self.tokenizer.encode("\n")] if single_line else [] @@ -254,8 +298,7 @@ class HFTorchInferenceModel(HFInferenceModel): len(prompt_tokens) + max_new, utils.koboldai_vars.max_length ), repetition_penalty=1.0, - bad_words_ids=self.badwordsids - + additional_bad_words_ids, + bad_words_ids=self.badwordsids + additional_bad_words_ids, use_cache=True, num_return_sequences=batch_count, ) @@ -275,6 +318,9 @@ class HFTorchInferenceModel(HFInferenceModel): tf_kwargs["revision"] = utils.koboldai_vars.revision tf_kwargs["cache_dir"] = "cache" + if self.lazy_load: + tf_kwargs.pop("low_cpu_mem_usage", None) + # If we have model hints for legacy model, use them rather than fall back. try: if self.model_name == "GPT2Custom": @@ -286,7 +332,41 @@ class HFTorchInferenceModel(HFInferenceModel): # Try to determine model type from either AutoModel or falling back to legacy try: - return AutoModelForCausalLM.from_pretrained(location, **tf_kwargs) + if self.lazy_load: + with lazy_loader.use_lazy_load(dematerialized_modules=True): + metamodel = AutoModelForCausalLM.from_config(self.model_config) + if utils.args.cpu: + cpu_map = {name: "cpu" for name in utils.layers_module_names} + for name in utils.get_missing_module_names( + metamodel, list(cpu_map.keys()) + ): + cpu_map[name] = "cpu" + tf_kwargs["device_map"] = cpu_map + else: + tf_kwargs["device_map"] = self.breakmodel_config.get_device_map( + metamodel + ) + + with lazy_loader.use_lazy_load( + enable=self.lazy_load, + # DO NOT DEMATERIALIZE MODULES / INIT WEIGHTS EMPTY!!! IT WILL EXPLODE!!!!!!! 
+ dematerialized_modules=False, + ): + model = AutoModelForCausalLM.from_pretrained( + location, + offload_folder="accelerate-disk-cache", + torch_dtype=self._get_target_dtype(), + **tf_kwargs, + ) + + if not self.lazy_load: + # We need to move the model to the desired device + if (not self.usegpu) or torch.cuda.device_count() <= 0: + model = model.to("cpu") + else: + model = model.to("cuda") + + return model except Exception as e: traceback_string = traceback.format_exc().lower() @@ -325,49 +405,6 @@ class HFTorchInferenceModel(HFInferenceModel): return True - def _move_to_devices(self) -> None: - for key, value in self.model.state_dict().items(): - target_dtype = ( - torch.float32 if breakmodel.primary_device == "cpu" else torch.float16 - ) - if value.dtype is not target_dtype: - accelerate.utils.set_module_tensor_to_device( - self.model, - tensor_name=key, - device=torch.device(value.device), - value=value, - dtype=target_dtype, - ) - - disk_blocks = breakmodel.disk_blocks - gpu_blocks = breakmodel.gpu_blocks - ram_blocks = len(utils.layers_module_names) - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - device_map = {} - - for name in utils.layers_module_names: - layer = int(name.rsplit(".", 1)[1]) - device = ( - ("disk" if layer < disk_blocks else "cpu") - if layer < ram_blocks - else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) - ) - device_map[name] = device - - for name in utils.get_missing_module_names(self.model, list(device_map.keys())): - device_map[name] = breakmodel.primary_device - - breakmodel.dispatch_model_ex( - self.model, - device_map, - main_device=breakmodel.primary_device, - offload_buffers=True, - offload_dir="accelerate-disk-cache", - ) - - gc.collect() - return - # Function to patch transformers to use our soft prompt def patch_embedding(self) -> None: if getattr(Embedding, "_koboldai_patch_causallm_model", None): @@ -409,401 +446,20 @@ class HFTorchInferenceModel(HFInferenceModel): Embedding.__call__ = new_embedding_call Embedding._koboldai_patch_causallm_model = self.model - def _get_lazy_load_callback(self, n_layers: int, convert_to_float16: bool = True): - if not self.lazy_load: - return - - - disk_blocks = breakmodel.disk_blocks - gpu_blocks = breakmodel.gpu_blocks - ram_blocks = ram_blocks = n_layers - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) - - def lazy_load_callback( - model_dict: Dict[str, Union[lazy_loader.LazyTensor, torch.Tensor]], - f, - is_safetensors: bool = False, - **_, - ): - if lazy_load_callback.nested: - return - lazy_load_callback.nested = True - - device_map: Dict[str, Union[str, int]] = {} - - @functools.lru_cache(maxsize=None) - def get_original_key(key) -> Optional[str]: - key_candidates = [ - original_key - for original_key in utils.module_names - if original_key.endswith(key) - ] - - if not key_candidates: - logger.debug(f"!!! 
No key candidates for {key}") - return None - - return max(key_candidates, key=len) - - for key, value in model_dict.items(): - original_key = get_original_key(key) - - if not original_key: - continue - - if isinstance(value, lazy_loader.LazyTensor) and not any( - original_key.startswith(n) for n in utils.layers_module_names - ): - device_map[key] = ( - utils.koboldai_vars.gpu_device - if utils.koboldai_vars.hascuda and self.usegpu - else "cpu" - if not utils.koboldai_vars.hascuda - or not self.breakmodel - else breakmodel.primary_device - ) - else: - layer = int( - max( - ( - n - for n in utils.layers_module_names - if original_key.startswith(n) - ), - key=len, - ).rsplit(".", 1)[1] - ) - device = ( - utils.koboldai_vars.gpu_device - if utils.koboldai_vars.hascuda and self.usegpu - else "disk" - if layer < disk_blocks and layer < ram_blocks - else "cpu" - if not utils.koboldai_vars.hascuda - or not self.breakmodel - else "shared" - if layer < ram_blocks - else bisect.bisect_right( - cumulative_gpu_blocks, layer - ram_blocks - ) - ) - device_map[key] = device - - if utils.num_shards is None or utils.current_shard == 0: - utils.offload_index = {} - if os.path.isdir("accelerate-disk-cache"): - # Delete all of the files in the disk cache folder without deleting the folder itself to allow people to create symbolic links for this folder - # (the folder doesn't contain any subfolders so os.remove will do just fine) - for filename in os.listdir("accelerate-disk-cache"): - try: - os.remove(os.path.join("accelerate-disk-cache", filename)) - except OSError: - pass - os.makedirs("accelerate-disk-cache", exist_ok=True) - if utils.num_shards is not None: - num_tensors = len( - utils.get_sharded_checkpoint_num_tensors( - utils.from_pretrained_model_name, - utils.from_pretrained_index_filename, - is_safetensors=is_safetensors, - **utils.from_pretrained_kwargs, - ) - ) - else: - num_tensors = len(device_map) - print(flush=True) - utils.koboldai_vars.status_message = "Loading model" - utils.koboldai_vars.total_layers = num_tensors - utils.koboldai_vars.loaded_layers = 0 - utils.bar = tqdm( - total=num_tensors, - desc="Loading model tensors", - file=utils.UIProgressBarFile(), - position=1 - ) - - if not is_safetensors: - # Torch lazyload - with zipfile.ZipFile(f, "r") as z: - try: - last_storage_key = None - zipfolder = os.path.basename(os.path.normpath(f)).split(".")[0] - f = None - current_offset = 0 - able_to_pin_layers = True - if utils.num_shards is not None: - utils.current_shard += 1 - for key in sorted( - device_map.keys(), - key=lambda k: ( - model_dict[k].key, - model_dict[k].seek_offset, - ), - ): - storage_key = model_dict[key].key - if ( - storage_key != last_storage_key - or model_dict[key].seek_offset < current_offset - ): - last_storage_key = storage_key - if isinstance(f, zipfile.ZipExtFile): - f.close() - ziproot = z.namelist()[0].split("/")[0] - f = z.open(f"{ziproot}/data/{storage_key}") - - current_offset = 0 - if current_offset != model_dict[key].seek_offset: - f.read(model_dict[key].seek_offset - current_offset) - current_offset = model_dict[key].seek_offset - device = device_map[key] - size = functools.reduce( - lambda x, y: x * y, model_dict[key].shape, 1 - ) - dtype = model_dict[key].dtype - nbytes = ( - size - if dtype is torch.bool - else size - * ( - ( - torch.finfo - if dtype.is_floating_point - else torch.iinfo - )(dtype).bits - >> 3 - ) - ) - # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
", end="", flush=True) - #logger.debug(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... ") - model_dict[key] = model_dict[key].materialize( - f, map_location="cpu" - ) - if model_dict[key].dtype is torch.float32: - utils.koboldai_vars.fp32_model = True - if ( - convert_to_float16 - and breakmodel.primary_device != "cpu" - and utils.koboldai_vars.hascuda - and ( - self.breakmodel - or self.usegpu - ) - and model_dict[key].dtype is torch.float32 - ): - model_dict[key] = model_dict[key].to(torch.float16) - if breakmodel.primary_device == "cpu" or ( - not self.usegpu - and not self.breakmodel - and model_dict[key].dtype is torch.float16 - ): - model_dict[key] = model_dict[key].to(torch.float32) - if device == "shared": - model_dict[key] = model_dict[key].to("cpu").detach_() - if able_to_pin_layers: - try: - model_dict[key] = model_dict[key].pin_memory() - except: - able_to_pin_layers = False - elif device == "disk": - accelerate.utils.offload_weight( - model_dict[key], - get_original_key(key), - "accelerate-disk-cache", - index=utils.offload_index, - ) - model_dict[key] = model_dict[key].to("meta") - else: - model_dict[key] = model_dict[key].to(device) - # print("OK", flush=True) - current_offset += nbytes - utils.bar.update(1) - utils.koboldai_vars.loaded_layers += 1 - finally: - if ( - utils.num_shards is None - or utils.current_shard >= utils.num_shards - ): - if utils.offload_index: - for name, tensor in utils.named_buffers: - dtype = tensor.dtype - if ( - convert_to_float16 - and breakmodel.primary_device != "cpu" - and utils.koboldai_vars.hascuda - and ( - self.breakmodel - or self.usegpu - ) - ): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or ( - not self.usegpu - and not self.breakmodel - ): - dtype = torch.float32 - if ( - name in model_dict - and model_dict[name].dtype is not dtype - ): - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is not dtype: - tensor = tensor.to(dtype) - if name not in utils.offload_index: - accelerate.utils.offload_weight( - tensor, - name, - "accelerate-disk-cache", - index=utils.offload_index, - ) - accelerate.utils.save_offload_index( - utils.offload_index, "accelerate-disk-cache" - ) - utils.bar.close() - utils.bar = None - utils.koboldai_vars.status_message = "" - lazy_load_callback.nested = False - if isinstance(f, zipfile.ZipExtFile): - f.close() - else: - # Loading with safetensors - try: - able_to_pin_layers = True - - if utils.num_shards is not None: - utils.current_shard += 1 - - for key in sorted( - device_map.keys(), - key=lambda k: model_dict[k].key, - ): - storage_key = model_dict[key].key - - device = device_map[key] - - # print(f"Transferring <{key}> to {f'({device.upper()})' if isinstance(device, str) else '[device ' + str(device) + ']'} ... 
", end="", flush=True) - - model_dict[key] = model_dict[key].materialize( - f, map_location="cpu" - ) - - if model_dict[key].dtype is torch.float32: - utils.koboldai_vars.fp32_model = True - - if ( - convert_to_float16 - and breakmodel.primary_device != "cpu" - and utils.koboldai_vars.hascuda - and ( - self.breakmodel - or self.usegpu - ) - and model_dict[key].dtype is torch.float32 - ): - model_dict[key] = model_dict[key].to(torch.float16) - - if breakmodel.primary_device == "cpu" or ( - not self.usegpu - and not self.breakmodel - and model_dict[key].dtype is torch.float16 - ): - model_dict[key] = model_dict[key].to(torch.float32) - - if device == "shared": - model_dict[key] = model_dict[key].to("cpu").detach_() - if able_to_pin_layers: - try: - model_dict[key] = model_dict[key].pin_memory() - except: - able_to_pin_layers = False - elif device == "disk": - accelerate.utils.offload_weight( - model_dict[key], - get_original_key(key), - "accelerate-disk-cache", - index=utils.offload_index, - ) - model_dict[key] = model_dict[key].to("meta") - else: - model_dict[key] = model_dict[key].to(device) - - utils.bar.update(1) - utils.koboldai_vars.loaded_layers += 1 - - finally: - if ( - utils.num_shards is None - or utils.current_shard >= utils.num_shards - ): - if utils.offload_index: - for name, tensor in utils.named_buffers: - dtype = tensor.dtype - if ( - convert_to_float16 - and breakmodel.primary_device != "cpu" - and utils.koboldai_vars.hascuda - and ( - self.breakmodel - or self.usegpu - ) - ): - dtype = torch.float16 - if breakmodel.primary_device == "cpu" or ( - not self.usegpu - and not self.breakmodel - ): - dtype = torch.float32 - if ( - name in model_dict - and model_dict[name].dtype is not dtype - ): - model_dict[name] = model_dict[name].to(dtype) - if tensor.dtype is not dtype: - tensor = tensor.to(dtype) - if name not in utils.offload_index: - accelerate.utils.offload_weight( - tensor, - name, - "accelerate-disk-cache", - index=utils.offload_index, - ) - accelerate.utils.save_offload_index( - utils.offload_index, "accelerate-disk-cache" - ) - utils.bar.close() - utils.bar = None - utils.koboldai_vars.status_message = "" - - lazy_load_callback.nested = False - - lazy_load_callback.nested = False - return lazy_load_callback - - @contextlib.contextmanager - def _maybe_use_float16(self, always_use: bool = False): - if always_use or ( - utils.koboldai_vars.hascuda - and self.low_mem - and (self.usegpu or self.breakmodel) - ): - original_dtype = torch.get_default_dtype() - torch.set_default_dtype(torch.float16) - yield True - torch.set_default_dtype(original_dtype) - else: - yield False - def breakmodel_device_list(self, n_layers, primary=None, selected=None): - # TODO: Find a better place for this or rework this - device_count = torch.cuda.device_count() if device_count < 2: primary = None + logger.debug("n_layers: {}".format(n_layers)) - logger.debug("gpu blocks: {}".format(breakmodel.gpu_blocks)) - gpu_blocks = breakmodel.gpu_blocks + ( - device_count - len(breakmodel.gpu_blocks) + logger.debug("gpu blocks: {}".format(self.breakmodel_config.gpu_blocks)) + + gpu_blocks = self.breakmodel_config.gpu_blocks + ( + device_count - len(self.breakmodel_config.gpu_blocks) ) * [0] + print(f"{Colors.YELLOW} DEVICE ID | LAYERS | DEVICE NAME{Colors.END}") + for i in range(device_count): name = torch.cuda.get_device_name(i) if len(name) > 47: @@ -813,72 +469,83 @@ class HFTorchInferenceModel(HFInferenceModel): print( f"{row_color}{Colors.YELLOW + '->' + row_color if i == selected else ' '} 
{'(primary)' if i == primary else ' '*9} {i:3} {sep_color}|{row_color} {gpu_blocks[i]:3} {sep_color}|{row_color} {name}{Colors.END}" ) + row_color = Colors.END sep_color = Colors.YELLOW print( - f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {breakmodel.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}" + f"{row_color}{Colors.YELLOW + '->' + row_color if -1 == selected else ' '} {' '*9} N/A {sep_color}|{row_color} {self.breakmodel_config.disk_blocks:3} {sep_color}|{row_color} (Disk cache){Colors.END}" ) print( f"{row_color} {' '*9} N/A {sep_color}|{row_color} {n_layers:3} {sep_color}|{row_color} (CPU){Colors.END}" ) def breakmodel_device_config(self, config): - # TODO: Find a better place for this or rework this - - global breakmodel, generator - import breakmodel - n_layers = utils.num_layers(config) - logger.debug("gpu blocks before modification: {}".format(breakmodel.gpu_blocks)) + logger.debug( + "gpu blocks before modification: {}".format( + self.breakmodel_config.gpu_blocks + ) + ) if utils.args.cpu: - breakmodel.gpu_blocks = [0] * n_layers + self.breakmodel_config.gpu_blocks = [0] * n_layers return - elif breakmodel.gpu_blocks == []: + elif self.breakmodel_config.gpu_blocks == []: logger.info("Breakmodel not specified, assuming GPU 0") - breakmodel.gpu_blocks = [n_layers] + self.breakmodel_config.gpu_blocks = [n_layers] n_layers = 0 - + else: s = n_layers - for i in range(len(breakmodel.gpu_blocks)): - if breakmodel.gpu_blocks[i] <= -1: - breakmodel.gpu_blocks[i] = s + for i in range(len(self.breakmodel_config.gpu_blocks)): + if self.breakmodel_config.gpu_blocks[i] <= -1: + self.breakmodel_config.gpu_blocks[i] = s break else: - s -= breakmodel.gpu_blocks[i] - assert sum(breakmodel.gpu_blocks) <= n_layers - n_layers -= sum(breakmodel.gpu_blocks) - if breakmodel.disk_blocks is not None: - assert breakmodel.disk_blocks <= n_layers - n_layers -= breakmodel.disk_blocks + s -= self.breakmodel_config.gpu_blocks[i] + assert sum(self.breakmodel_config.gpu_blocks) <= n_layers + n_layers -= sum(self.breakmodel_config.gpu_blocks) + if self.breakmodel_config.disk_blocks is not None: + assert self.breakmodel_config.disk_blocks <= n_layers + n_layers -= self.breakmodel_config.disk_blocks logger.init_ok("Final device configuration:", status="Info") - self.breakmodel_device_list(n_layers, primary=breakmodel.primary_device) - with open("settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w") as file: - file.write("{}\n{}".format(",".join(map(str, breakmodel.gpu_blocks)), breakmodel.disk_blocks)) + self.breakmodel_device_list( + n_layers, primary=self.breakmodel_config.primary_device + ) + with open( + "settings/{}.breakmodel".format(self.model_name.replace("/", "_")), "w" + ) as file: + file.write( + "{}\n{}".format( + ",".join(map(str, self.breakmodel_config.gpu_blocks)), + self.breakmodel_config.disk_blocks, + ) + ) # If all layers are on the same device, use the old GPU generation mode - while len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] == 0: - breakmodel.gpu_blocks.pop() + while ( + len(self.breakmodel_config.gpu_blocks) + and self.breakmodel_config.gpu_blocks[-1] == 0 + ): + self.breakmodel_config.gpu_blocks.pop() self.breakmodel = True - if len(breakmodel.gpu_blocks) and breakmodel.gpu_blocks[-1] in ( + if len(self.breakmodel_config.gpu_blocks) and self.breakmodel_config.gpu_blocks[ + -1 + ] in ( -1, utils.num_layers(config), ): logger.debug("All layers on same GPU. 
Breakmodel disabled") self.breakmodel = False self.usegpu = True - utils.koboldai_vars.gpu_device = len(breakmodel.gpu_blocks) - 1 + utils.koboldai_vars.gpu_device = len(self.breakmodel_config.gpu_blocks) - 1 return - if not breakmodel.gpu_blocks: + if not self.breakmodel_config.gpu_blocks: logger.warning("Nothing assigned to a GPU, reverting to CPU only mode") - import breakmodel - - breakmodel.primary_device = "cpu" self.breakmodel = False self.usegpu = False return diff --git a/modeling/lazy_loader.py b/modeling/lazy_loader.py index 5a27d549..b61f9be6 100644 --- a/modeling/lazy_loader.py +++ b/modeling/lazy_loader.py @@ -47,6 +47,7 @@ POSSIBILITY OF SUCH DAMAGE. import contextlib from functools import reduce import itertools +import time import zipfile import pickle import torch @@ -54,35 +55,57 @@ import numpy as np import collections import _codecs import os +from typing import Any, Callable, Dict, Optional, Tuple, Type +import accelerate + from torch.nn import Module -from typing import Any, Callable, Dict, Optional, Tuple, Type, Union +from torch.storage import UntypedStorage # Safetensors is a dependency for the local version, TPU/Colab doesn't # support it yet. try: import safetensors + HAS_SAFETENSORS = True except ModuleNotFoundError: HAS_SAFETENSORS = False import utils +from logger import logger + +# Storage of zipfile handles for each shard +torch_checkpoint_file_handles = {} -_EXTRA_STATE_KEY_SUFFIX = "_extra_state" +class CheckpointChunkCache: + """Storage for common checkpoint weight files to speed up loading. In order + for this to be effective at all, weights must be loaded in ascending order + of (key, seek_offset). + """ + # There is considerable room for improvement here; we could peek into the + # state dict and preload the N most frequent weight files or something, but + # this first implementation is on par with the speed of whatever the + # previous callback did. 
-STORAGE_TYPE_MAP = { - torch.float64: torch.DoubleStorage, - torch.float32: torch.FloatStorage, - torch.float16: torch.HalfStorage, - torch.int64: torch.LongStorage, - torch.int32: torch.IntStorage, - torch.int16: torch.ShortStorage, - torch.int8: torch.CharStorage, - torch.uint8: torch.ByteStorage, - torch.bool: torch.BoolStorage, - torch.bfloat16: torch.BFloat16Storage, -} + file_name = None + key = None + handle = None + + hit_data = {"hits": 0, "misses": 0} + + @classmethod + def clear(cls, unload_model: bool = False) -> None: + if unload_model: + cls.hit_data["hits"] = 0 + cls.hit_data["misses"] = 0 + + if cls.handle: + cls.handle.close() + + cls.file_name = None + cls.key = None + cls.handle = None class LazyTensor: @@ -111,6 +134,7 @@ class TorchLazyTensor(LazyTensor): self.stride = stride self.requires_grad = requires_grad self.backward_hooks = backward_hooks + self.file_name = None def __view(self, f: Callable): return f"{type(self).__name__}(storage_type={f(self.storage_type)}, key={f(self.key)}, location={f(self.location)}, dtype={f(self.dtype)}, seek_offset={f(self.seek_offset)}, shape={f(self.shape)}, stride={f(self.stride)}, requires_grad={f(self.requires_grad)}, backward_hooks={f(self.backward_hooks)})" @@ -120,12 +144,42 @@ class TorchLazyTensor(LazyTensor): def materialize( self, - checkpoint: Union[zipfile.ZipFile, zipfile.ZipExtFile], map_location=None, no_grad=True, - filename="pytorch_model.bin", ) -> torch.Tensor: - filename = os.path.basename(os.path.normpath(filename)).split(".")[0] + checkpoint = torch_checkpoint_file_handles[self.file_name] + filename = os.path.basename(os.path.normpath(self.file_name)).split(".")[0] + + # Often we are using the same weight file to store multiple tensors, so + # let's cache the file handle to maintain a seek position and other + # fast stuff. + if ( + CheckpointChunkCache.file_name != filename + or CheckpointChunkCache.key != self.key + or not CheckpointChunkCache.handle + ): + # Cache miss. Assuming weights are loaded in order of + # (key, seek_offset), this means we need to invalidate the cache. + # print("!", end="", flush=True) + CheckpointChunkCache.hit_data["misses"] += 1 + + CheckpointChunkCache.clear() + + CheckpointChunkCache.file_name = filename + CheckpointChunkCache.key = self.key + try: + CheckpointChunkCache.handle = checkpoint.open( + f"archive/data/{self.key}", "r" + ) + except KeyError: + CheckpointChunkCache.handle = checkpoint.open( + f"{filename}/data/{self.key}", "r" + ) + else: + # Cache hit. Hip hip hooray! 
:^) + # print(".", end="", flush=True) + CheckpointChunkCache.hit_data["hits"] += 1 + size = reduce(lambda x, y: x * y, self.shape, 1) dtype = self.dtype nbytes = ( @@ -133,27 +187,24 @@ class TorchLazyTensor(LazyTensor): if dtype is torch.bool else size * ( - (torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits + (torch.finfo if self.dtype.is_floating_point else torch.iinfo)( + self.dtype + ).bits >> 3 ) ) - if isinstance(checkpoint, zipfile.ZipFile): - try: - f = checkpoint.open(f"archive/data/{self.key}", "r") - except: - f = checkpoint.open(f"{filename}/data/{self.key}", "r") - f.read(self.seek_offset) - else: - f = checkpoint - try: - storage = STORAGE_TYPE_MAP[dtype].from_buffer(f.read(nbytes), "little") - finally: - if isinstance(checkpoint, zipfile.ZipFile): - f.close() + + assert isinstance(checkpoint, zipfile.ZipFile) + + CheckpointChunkCache.handle.seek(self.seek_offset, os.SEEK_SET) + storage = UntypedStorage.from_buffer( + CheckpointChunkCache.handle.read(nbytes), "little", dtype=self.dtype + ) + storage = torch.serialization._get_restore_location(map_location)( storage, self.location ) - tensor = torch.tensor([], dtype=storage.dtype, device=storage.device) + tensor = torch.tensor([], dtype=self.dtype, device=storage.device) tensor.set_(storage, 0, self.shape, self.stride) tensor.requires_grad = not no_grad and self.requires_grad tensor._backward_hooks = self.backward_hooks @@ -237,6 +288,7 @@ class _LazyUnpickler(RestrictedUnpickler): lazy_loaded_storages: Dict[str, LazyTensor] def __init__(self, *args, **kwargs): + # print(args, kwargs) self.lazy_loaded_storages = {} return super().__init__(*args, **kwargs) @@ -271,117 +323,6 @@ def _rebuild_tensor(lazy_storage: LazyTensor, storage_offset, shape, stride): return lazy_storage -# Modified version of https://github.com/pytorch/pytorch/blob/v1.11.0-rc4/torch/nn/modules/module.py#L1346-L1438 -def _load_from_state_dict( - self, - state_dict, - prefix, - local_metadata, - strict, - missing_keys, - unexpected_keys, - error_msgs, -): - for hook in self._load_state_dict_pre_hooks.values(): - hook( - state_dict, - prefix, - local_metadata, - strict, - missing_keys, - unexpected_keys, - error_msgs, - ) - - persistent_buffers = { - k: v - for k, v in self._buffers.items() - if k not in self._non_persistent_buffers_set - } - local_name_params = itertools.chain( - self._parameters.items(), persistent_buffers.items() - ) - local_state = {k: v for k, v in local_name_params if v is not None} - - for name, param in local_state.items(): - key = prefix + name - if key in state_dict: - input_param = state_dict[key] - if not torch.overrides.is_tensor_like(input_param): - error_msgs.append( - 'While copying the parameter named "{}", ' - "expected torch.Tensor or Tensor-like object from checkpoint but " - "received {}".format(key, type(input_param)) - ) - continue - - # This is used to avoid copying uninitialized parameters into - # non-lazy modules, since they dont have the hook to do the checks - # in such case, it will error when accessing the .shape attribute. 
- is_param_lazy = torch.nn.parameter.is_lazy(param) - # Backward compatibility: loading 1-dim tensor from 0.3.* to version 0.4+ - if ( - not is_param_lazy - and len(param.shape) == 0 - and len(input_param.shape) == 1 - ): - input_param = input_param[0] - - if not is_param_lazy and input_param.shape != param.shape: - # local shape should match the one in checkpoint - error_msgs.append( - "size mismatch for {}: copying a param with shape {} from checkpoint, " - "the shape in current model is {}.".format( - key, input_param.shape, param.shape - ) - ) - continue - try: - with torch.no_grad(): - # param.copy_(input_param) - new_param = torch.nn.Parameter( - input_param, requires_grad=param.requires_grad - ) # This line is new - if name in self._parameters: # This line is new - self._parameters[name] = new_param # This line is new - if name in persistent_buffers: # This line is new - self._buffers[name] = new_param # This line is new - except Exception as ex: - error_msgs.append( - 'While copying the parameter named "{}", ' - "whose dimensions in the model are {} and " - "whose dimensions in the checkpoint are {}, " - "an exception occurred : {}.".format( - key, param.size(), input_param.size(), ex.args - ) - ) - elif strict: - missing_keys.append(key) - - extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX - if ( - hasattr(Module, "set_extra_state") - and getattr(self.__class__, "set_extra_state", Module.set_extra_state) - is not Module.set_extra_state - ): # if getattr(self.__class__, "set_extra_state", Module.set_extra_state) is not Module.set_extra_state: - if extra_state_key in state_dict: - self.set_extra_state(state_dict[extra_state_key]) - elif strict: - missing_keys.append(extra_state_key) - elif strict and (extra_state_key in state_dict): - unexpected_keys.append(extra_state_key) - - if strict: - for key in state_dict.keys(): - if key.startswith(prefix) and key != extra_state_key: - input_name = key[len(prefix) :] - input_name = input_name.split(".", 1)[ - 0 - ] # get the name of param/buffer/child - if input_name not in self._modules and input_name not in local_state: - unexpected_keys.append(key) - - def safetensors_load_tensor_independently( checkpoint_file: str, tensor_key: str, device: Any ) -> torch.Tensor: @@ -412,15 +353,18 @@ def patch_safetensors(callback): tensors = {} with safetensors.safe_open( - checkpoint_file, framework="pt", device=intermediary_device, + checkpoint_file, + framework="pt", + device=intermediary_device, ) as f: for key in f.keys(): tensors[key] = None for key in tensors.keys(): - tensors[key] = SafetensorsLazyTensor( - checkpoint_file=checkpoint_file, key=key, location=intermediary_device, + checkpoint_file=checkpoint_file, + key=key, + location=intermediary_device, ) if callback is not None: @@ -462,13 +406,14 @@ def use_lazy_load( enable=True, callback: Optional[Callable] = None, dematerialized_modules=False, - use_accelerate_init_empty_weights=False, ): if not enable: with use_custom_unpickler(RestrictedUnpickler): yield False return + begin_time = time.time() + try: old_rebuild_tensor = torch._utils._rebuild_tensor torch._utils._rebuild_tensor = _rebuild_tensor @@ -477,22 +422,30 @@ def use_lazy_load( old_torch_load = torch.load def torch_load(f, map_location=None, pickle_module=pickle, **pickle_load_args): - retval = old_torch_load( + model_dict = old_torch_load( f=f, map_location=map_location, pickle_module=pickle_module, **pickle_load_args, ) + + if f not in torch_checkpoint_file_handles: + torch_checkpoint_file_handles[f] = zipfile.ZipFile(f, "r") 
+ + for k, v in model_dict.items(): + v.file_name = f + if callback is not None: callback( - retval, + model_dict, f=f, map_location=map_location, pickle_module=pickle_module, is_safetensors=False, **pickle_load_args, ) - return retval + + return model_dict torch.load = torch_load @@ -500,30 +453,8 @@ def use_lazy_load( patch_safetensors(callback) if dematerialized_modules: - if use_accelerate_init_empty_weights: - import accelerate - - init_empty_weights = accelerate.init_empty_weights() - init_empty_weights.__enter__() - else: - old_linear_init = torch.nn.Linear.__init__ - old_embedding_init = torch.nn.Embedding.__init__ - old_layernorm_init = torch.nn.LayerNorm.__init__ - - def linear_init(self, *args, device=None, **kwargs): - return old_linear_init(self, *args, device="meta", **kwargs) - - def embedding_init(self, *args, device=None, **kwargs): - return old_embedding_init(self, *args, device="meta", **kwargs) - - def layernorm_init(self, *args, device=None, **kwargs): - return old_layernorm_init(self, *args, device="meta", **kwargs) - - torch.nn.Linear.__init__ = linear_init - torch.nn.Embedding.__init__ = embedding_init - torch.nn.LayerNorm.__init__ = layernorm_init - old_load_from_state_dict = torch.nn.Module._load_from_state_dict - torch.nn.Module._load_from_state_dict = _load_from_state_dict + init_empty_weights = accelerate.init_empty_weights() + init_empty_weights.__enter__() with use_custom_unpickler(_LazyUnpickler): yield True @@ -531,11 +462,30 @@ def use_lazy_load( finally: torch._utils._rebuild_tensor = old_rebuild_tensor torch.load = old_torch_load + + post_load_cleanup() + logger.debug( + f"[lazy_load] Context closed in {round(time.time() - begin_time, 2)} seconds." + ) + if dematerialized_modules: - if use_accelerate_init_empty_weights: - init_empty_weights.__exit__(None, None, None) - else: - torch.nn.Linear.__init__ = old_linear_init - torch.nn.Embedding.__init__ = old_embedding_init - torch.nn.LayerNorm.__init__ = old_layernorm_init - torch.nn.Module._load_from_state_dict = old_load_from_state_dict + init_empty_weights.__exit__(None, None, None) + + +def post_load_cleanup() -> None: + """Close dangling file pointers and clear caches after the load is complete.""" + global torch_checkpoint_file_handles + + logger.debug( + f"[lazy_load] CheckpointChunkCache Hit Data: {CheckpointChunkCache.hit_data}" + ) + CheckpointChunkCache.clear(unload_model=True) + + # Bar is initialized in + # patches.patch_transformers_for_lazyload._load_state_dict_into_meta_model, + # as it has access to the state dict (for getting tensor count) + utils.bar = None + + for v in torch_checkpoint_file_handles.values(): + v.close() + torch_checkpoint_file_handles = {} diff --git a/modeling/patches.py b/modeling/patches.py index f6b1ff0a..827e997a 100644 --- a/modeling/patches.py +++ b/modeling/patches.py @@ -2,7 +2,7 @@ from __future__ import annotations import copy import requests -from typing import Iterable, List +from typing import List from tqdm.auto import tqdm import transformers @@ -10,6 +10,7 @@ from transformers import ( PreTrainedModel, modeling_utils, ) +from modeling.lazy_loader import LazyTensor import utils @@ -125,9 +126,185 @@ def patch_transformers_generation() -> None: transformers.generation.logits_process.NoBadWordsLogitsProcessor.__init__ = new_init -def patch_transformers() -> None: +def patch_transformers_for_lazyload() -> None: + """ + Most of the code is modified code from the Accelerate and Transformers + projects, made by HuggingFace. 
The license for these projects are as follows: + --- + Copyright The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + """ + import torch + from accelerate.utils import set_module_tensor_to_device, offload_weight + + def _load_state_dict_into_meta_model( + model, + state_dict, + loaded_state_dict_keys, + start_prefix, + expected_keys, + device_map=None, + offload_folder=None, + offload_index=None, + state_dict_folder=None, + state_dict_index=None, + dtype=None, + # PATCH: load_in_8bit was renamed to is_quantized in Transformers 4.30, keep + # both for short term compatibility + load_in_8bit=False, + is_quantized=False, + is_safetensors=False, + keep_in_fp32_modules=None, + ): + is_quantized = is_quantized or load_in_8bit + + if is_quantized: + from .utils.bitsandbytes import set_module_8bit_tensor_to_device + + error_msgs = [] + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if "gamma" in key: + new_key = key.replace("gamma", "weight") + if "beta" in key: + new_key = key.replace("beta", "bias") + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + +# BEGIN PATCH + utils.bar = tqdm(total=len(state_dict), desc="Loading model tensors", file=utils.UIProgressBarFile(), position=1) + + for param_name, param in sorted( + state_dict.items(), + # State dict must be ordered in this manner to make the caching in + # lazy_loader.py effective + key=lambda x: ( + x[1].key, + x[1].seek_offset, + ), + ): + + if isinstance(param, LazyTensor): + # Should always be true + param = param.materialize(map_location="cpu") + utils.bar.update(1) +# END PATCH + + # First part of the test is always true as load_state_dict_keys always contains state_dict keys. + if ( + param_name not in loaded_state_dict_keys + or param_name not in expected_keys + ): + continue + + if param_name.startswith(start_prefix): + param_name = param_name[len(start_prefix) :] + + module_name = param_name + + # We convert floating dtypes to the `dtype` passed. We want to keep the buffers/params + # in int/uint/bool and not cast them. 
+ if dtype is not None and torch.is_floating_point(param): + if ( + keep_in_fp32_modules is not None + and any( + module_to_keep_in_fp32 in param_name + for module_to_keep_in_fp32 in keep_in_fp32_modules + ) + and dtype == torch.float16 + ): + param = param.to(torch.float32) + else: + param = param.to(dtype) + + # For compatibility with PyTorch load_state_dict which converts state dict dtype to existing dtype in model + if dtype is None: + old_param = model + splits = param_name.split(".") + for split in splits: + old_param = getattr(old_param, split) + if old_param is None: + break + + if old_param is not None: + param = param.to(old_param.dtype) + + if device_map is None: + param_device = "cpu" + else: + # find next higher level module that is defined in device_map: + # bert.lm_head.weight -> bert.lm_head -> bert -> '' + while len(module_name) > 0 and module_name not in device_map: + module_name = ".".join(module_name.split(".")[:-1]) + if module_name == "" and "" not in device_map: + # TODO: group all errors and raise at the end. + raise ValueError(f"{param_name} doesn't have any device set.") + param_device = device_map[module_name] + + if param_device == "disk": + if not is_safetensors: + offload_index = offload_weight( + param, param_name, offload_folder, offload_index + ) + elif param_device == "cpu" and state_dict_index is not None: + state_dict_index = offload_weight( + param, param_name, state_dict_folder, state_dict_index + ) + elif not is_quantized: + # For backward compatibility with older versions of `accelerate` + set_module_tensor_to_device( + model, + tensor_name=param_name, + device=param_device, + value=param, + dtype=dtype, + ) + else: + if ( + param.dtype == torch.int8 + and param_name.replace("weight", "SCB") in state_dict.keys() + ): + fp16_statistics = state_dict[param_name.replace("weight", "SCB")] + else: + fp16_statistics = None + + if "SCB" not in param_name: + set_module_8bit_tensor_to_device( + model, + param_name, + param_device, + value=param, + fp16_statistics=fp16_statistics, + ) + + return error_msgs, offload_index, state_dict_index + + transformers.modeling_utils._load_state_dict_into_meta_model = ( + _load_state_dict_into_meta_model + ) + + +def patch_transformers(use_tpu: bool) -> None: patch_transformers_download() patch_transformers_loader() - # Doesn't do anything for TPU - patch_transformers_generation() + if not use_tpu: + patch_transformers_generation() + patch_transformers_for_lazyload() diff --git a/prompt_tuner.py b/prompt_tuner.py index b1cbf78d..174a2391 100644 --- a/prompt_tuner.py +++ b/prompt_tuner.py @@ -855,7 +855,7 @@ class TrainerBase(abc.ABC): lazy_load_callback.nested = False # Since we're using lazy loader, we need to figure out what the model's hidden layers are called - with lazy_loader.use_lazy_load(dematerialized_modules=True, use_accelerate_init_empty_weights=True): + with lazy_loader.use_lazy_load(dematerialized_modules=True): try: metamodel = AutoModelForCausalLM.from_config(model_config) except Exception as e: diff --git a/requirements_mtj.txt b/requirements_mtj.txt index 1f4359f6..b3521d03 100644 --- a/requirements_mtj.txt +++ b/requirements_mtj.txt @@ -1,4 +1,4 @@ -torch >= 1.9, < 1.13 +torch == 2.0 numpy tqdm requests @@ -34,3 +34,4 @@ ijson ftfy pydub sentencepiece +accelerate==0.18.0 \ No newline at end of file diff --git a/tpu_mtj_backend.py b/tpu_mtj_backend.py index ec69f66d..5a5271e2 100644 --- a/tpu_mtj_backend.py +++ b/tpu_mtj_backend.py @@ -1171,139 +1171,118 @@ def load_model(path: str, model_type: str, 
badwordsids=koboldai_settings.badword if callback.nested: return callback.nested = True - with zipfile.ZipFile(f, "r") as z: - try: - last_storage_key = None - zipfolder = os.path.basename(os.path.normpath(f)).split('.')[0] - f = None - current_offset = 0 - if utils.current_shard == 0: - print("\n\n\nThis model has ", f"{hk.data_structures.tree_size(network.state['params']):,d}".replace(",", " "), " parameters.\n") - - if utils.num_shards is None or utils.current_shard == 0: - if utils.num_shards is not None: - num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs)) - else: - num_tensors = len(model_dict) - - if socketio is None: - utils.bar = tqdm(total=num_tensors, desc="Loading model tensors") - else: - utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=utils.UIProgressBarFile(socketio.emit)) - koboldai_vars.status_message = "Loading model" - koboldai_vars.loaded_layers = 0 - koboldai_vars.total_layers = num_tensors + try: + if utils.current_shard == 0: + print("\n\n\nThis model has ", f"{hk.data_structures.tree_size(network.state['params']):,d}".replace(",", " "), " parameters.\n") + if utils.num_shards is None or utils.current_shard == 0: if utils.num_shards is not None: - utils.current_shard += 1 - for key in sorted(model_dict.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)): - model_spec_key = max((k for k in model_spec.keys() if key.endswith(k)), key=len, default=None) + num_tensors = len(utils.get_sharded_checkpoint_num_tensors(utils.from_pretrained_model_name, utils.from_pretrained_index_filename, **utils.from_pretrained_kwargs)) + else: + num_tensors = len(model_dict) - # Some model weights are used by transformers but not by MTJ. - # We have to materialize these weights anyways because - # transformers will throw a tantrum otherwise. To attain - # the least possible memory usage, we create them as meta - # tensors, which don't take up any actual CPU or TPU memory. 
- if model_spec_key is None: - model_dict[key] = torch.empty(model_dict[key].shape, dtype=model_dict[key].dtype, device="meta") - utils.bar.update(1) - koboldai_vars.loaded_layers += 1 - continue + if socketio is None: + utils.bar = tqdm(total=num_tensors, desc="Loading model tensors") + else: + utils.bar = tqdm(total=num_tensors, desc="Loading model tensors", file=utils.UIProgressBarFile(socketio.emit)) + koboldai_vars.status_message = "Loading model" + koboldai_vars.loaded_layers = 0 + koboldai_vars.total_layers = num_tensors - storage_key = model_dict[key].key - if storage_key != last_storage_key or model_dict[key].seek_offset < current_offset: - last_storage_key = storage_key - if isinstance(f, zipfile.ZipExtFile): - f.close() - try: - f = z.open(f"archive/data/{storage_key}") - except: - f = z.open(f"{zipfolder}/data/{storage_key}") - current_offset = 0 - if current_offset != model_dict[key].seek_offset: - f.read(model_dict[key].seek_offset - current_offset) - current_offset = model_dict[key].seek_offset - spec = model_spec[model_spec_key] - transforms = set(spec.get("transforms", ())) - if not isinstance(model_dict[key], lazy_loader.LazyTensor): - error = f"Duplicate key {repr(key)}" - print("\n\nERROR: " + error, file=sys.stderr) - raise RuntimeError(error) - size = functools.reduce(lambda x, y: x * y, model_dict[key].shape, 1) - dtype = model_dict[key].dtype - nbytes = size if dtype is torch.bool else size * ((torch.finfo if dtype.is_floating_point else torch.iinfo)(dtype).bits >> 3) - tensor = model_dict[key].materialize(f, map_location="cpu") - model_dict[key] = tensor.to("meta") - current_offset += nbytes + if utils.num_shards is not None: + utils.current_shard += 1 - # MTJ requires certain mathematical operations to be performed - # on tensors in order for them to be in the correct format - if "remove_first_two_rows" in transforms: - tensor = tensor[2:] - if "divide_by_shards" in transforms: - tensor /= params["cores_per_replica"] - if "vocab_pad" in transforms: - tensor = torch.nn.functional.pad(tensor, (0,) * (tensor.ndim * 2 - 1) + (params["n_vocab_padding"],)) - # We don't need to transpose linear module weights anymore because MTJ will do it for us if `transposed_linear` is set to True in the config - #if "no_transpose" not in transforms and tensor.ndim == 2: - # tensor = tensor.T - tensor.unsqueeze_(0) - + for key in sorted(model_dict.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)): + model_spec_key = max((k for k in model_spec.keys() if key.endswith(k)), key=len, default=None) - # Shard the tensor so that parts of the tensor can be used - # on different TPU cores - tensor = reshard_reverse( - tensor, - params["cores_per_replica"], - network.state["params"][spec["module"]][spec["param"]].shape, - ) - tensor = tensor.detach() - # numpy does not support bfloat16 - if tensor.dtype is torch.bfloat16: - tensor = tensor.to(torch.float32) - tensor = jnp.array(tensor) - if tensor.dtype is torch.float16 or tensor.dtype is torch.float32: - tensor = tensor.bfloat16() - network.state["params"][spec["module"]][spec["param"]] = move_xmap( - tensor, - np.empty(params["cores_per_replica"]), - ) - - koboldai_vars.loaded_layers += 1 - try: - time.sleep(0.01) - except: - pass + # Some model weights are used by transformers but not by MTJ. + # We have to materialize these weights anyways because + # transformers will throw a tantrum otherwise. 
To attain + # the least possible memory usage, we create them as meta + # tensors, which don't take up any actual CPU or TPU memory. + if model_spec_key is None: + model_dict[key] = torch.empty(model_dict[key].shape, dtype=model_dict[key].dtype, device="meta") utils.bar.update(1) + koboldai_vars.loaded_layers += 1 + continue - if utils.num_shards is not None and utils.current_shard < utils.num_shards: - return + spec = model_spec[model_spec_key] + transforms = set(spec.get("transforms", ())) - # Check for tensors that MTJ needs that were not provided in the - # HF model - for mk, mv in network.state["params"].items(): - for pk, pv in mv.items(): - if isinstance(pv, PlaceholderTensor): - # The transformers GPT-J models apparently do not - # have embedding bias, whereas MTJ GPT-J models do, - # so we have to supplement an embedding bias tensor - # by creating a tensor with the necessary shape, filled - # with zeros. - if mk == "causal_transformer_shard/~/embedding_shard/~/linear" and pk == "b": - mv[pk] = move_xmap(jnp.zeros(mv[pk].shape, dtype=jnp.bfloat16), np.empty(params["cores_per_replica"])) + if not isinstance(model_dict[key], lazy_loader.LazyTensor): + error = f"Duplicate key {repr(key)}" + print("\n\nERROR: " + error, file=sys.stderr) + raise RuntimeError(error) - else: - error = f"{mk} {pk} could not be found in the model checkpoint" - print("\n\nERROR: " + error, file=sys.stderr) - raise RuntimeError(error) - finally: - if utils.num_shards is None or utils.current_shard >= utils.num_shards: - utils.bar.close() - utils.bar = None - koboldai_vars.status_message = "" - callback.nested = False - if isinstance(f, zipfile.ZipExtFile): - f.close() + tensor = model_dict[key].materialize(map_location="cpu") + model_dict[key] = tensor.to("meta") + + # MTJ requires certain mathematical operations to be performed + # on tensors in order for them to be in the correct format + if "remove_first_two_rows" in transforms: + tensor = tensor[2:] + if "divide_by_shards" in transforms: + tensor /= params["cores_per_replica"] + if "vocab_pad" in transforms: + tensor = torch.nn.functional.pad(tensor, (0,) * (tensor.ndim * 2 - 1) + (params["n_vocab_padding"],)) + # We don't need to transpose linear module weights anymore because MTJ will do it for us if `transposed_linear` is set to True in the config + #if "no_transpose" not in transforms and tensor.ndim == 2: + # tensor = tensor.T + tensor.unsqueeze_(0) + + + # Shard the tensor so that parts of the tensor can be used + # on different TPU cores + tensor = reshard_reverse( + tensor, + params["cores_per_replica"], + network.state["params"][spec["module"]][spec["param"]].shape, + ) + tensor = tensor.detach() + # numpy does not support bfloat16 + if tensor.dtype is torch.bfloat16: + tensor = tensor.to(torch.float32) + tensor = jnp.array(tensor) + if tensor.dtype is torch.float16 or tensor.dtype is torch.float32: + tensor = tensor.bfloat16() + network.state["params"][spec["module"]][spec["param"]] = move_xmap( + tensor, + np.empty(params["cores_per_replica"]), + ) + + koboldai_vars.loaded_layers += 1 + try: + time.sleep(0.01) + except: + pass + utils.bar.update(1) + + if utils.num_shards is not None and utils.current_shard < utils.num_shards: + return + + # Check for tensors that MTJ needs that were not provided in the + # HF model + for mk, mv in network.state["params"].items(): + for pk, pv in mv.items(): + if isinstance(pv, PlaceholderTensor): + # The transformers GPT-J models apparently do not + # have embedding bias, whereas MTJ GPT-J models do, + # so 
we have to supplement an embedding bias tensor + # by creating a tensor with the necessary shape, filled + # with zeros. + if mk == "causal_transformer_shard/~/embedding_shard/~/linear" and pk == "b": + mv[pk] = move_xmap(jnp.zeros(mv[pk].shape, dtype=jnp.bfloat16), np.empty(params["cores_per_replica"])) + + else: + error = f"{mk} {pk} could not be found in the model checkpoint" + print("\n\nERROR: " + error, file=sys.stderr) + raise RuntimeError(error) + finally: + if utils.num_shards is None or utils.current_shard >= utils.num_shards: + utils.bar.close() + utils.bar = None + koboldai_vars.status_message = "" + callback.nested = False callback.nested = False if os.path.isdir(koboldai_vars.model.replace('/', '_')): diff --git a/utils.py b/utils.py index 13ebb6a3..e7e20c95 100644 --- a/utils.py +++ b/utils.py @@ -650,17 +650,6 @@ class UIProgressBarFile(object): def flush(self): pass -def get_auxilary_device(): - """Get device auxilary tensors like inputs should be stored on.""" - - # NOTE: TPU isn't a torch device, so TPU stuff gets sent to CPU. - if koboldai_vars.hascuda and koboldai_vars.usegpu: - return koboldai_vars.gpu_device - elif koboldai_vars.hascuda and koboldai_vars.breakmodel: - import breakmodel - return breakmodel.primary_device - return "cpu" - #==================================================================# # Strips submitted text from the text returned by the AI #==================================================================#
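For reviewers of the `BreakmodelConfig` hunk in `modeling/inference_models/hf_torch.py`: the heart of the new approach is the layer-to-device assignment built by `get_device_map()`. Below is a minimal, self-contained sketch of that assignment using a hypothetical 12-layer model and made-up block counts; in the real code the module names come from `utils.layers_module_names` and the block counts from the user's GPU/disk layer settings.

```python
# Sketch of the layer-to-device mapping used by BreakmodelConfig.get_device_map.
# All inputs below are hypothetical stand-ins for what utils/koboldai provide.
import bisect
import itertools

disk_blocks = 2                  # layers offloaded to the accelerate disk cache
gpu_blocks = [3, 4]              # layers per CUDA device (GPU 0 gets 3, GPU 1 gets 4)
layers_module_names = [f"transformer.h.{i}" for i in range(12)]  # 12-layer toy model

ram_blocks = len(layers_module_names) - sum(gpu_blocks)           # layers kept off-GPU
cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))   # (3, 7)

device_map = {}
for name in layers_module_names:
    layer = int(name.rsplit(".", 1)[1])
    if layer < ram_blocks:
        # The first disk_blocks off-GPU layers go to disk, the rest stay in CPU RAM.
        device_map[name] = "disk" if layer < disk_blocks else "cpu"
    else:
        # Remaining layers are spread across the GPUs in order of gpu_blocks.
        device_map[name] = bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks)

for name, device in device_map.items():
    print(f"{name} -> {device}")
# h.0-h.1 -> disk, h.2-h.4 -> cpu, h.5-h.7 -> GPU 0, h.8-h.11 -> GPU 1
```

Any module not covered by `utils.layers_module_names` (embeddings, final layer norm, and so on) is then pinned to `primary_device`, which falls back to `"cpu"` when `--cpu` is passed or when no blocks are assigned to a GPU.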
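The new `CheckpointChunkCache` in `modeling/lazy_loader.py` keeps a single zip member handle open per `(checkpoint file, storage key)`, which only pays off when tensors are materialized in ascending `(key, seek_offset)` order; that is exactly why the patched `_load_state_dict_into_meta_model` sorts the state dict on that tuple. A toy illustration of the hit/miss accounting that shows up in the `[lazy_load] CheckpointChunkCache Hit Data` debug line, using made-up tensor records:

```python
# Hypothetical tensor records standing in for TorchLazyTensor entries.
from dataclasses import dataclass

@dataclass
class FakeLazyTensor:
    name: str
    key: str          # storage blob inside the checkpoint zip
    seek_offset: int  # byte offset of this tensor within that blob

class ChunkCache:
    key = None
    hit_data = {"hits": 0, "misses": 0}

    @classmethod
    def read(cls, tensor: FakeLazyTensor) -> None:
        if cls.key != tensor.key:
            cls.hit_data["misses"] += 1   # would close the old handle and open a new one
            cls.key = tensor.key
        else:
            cls.hit_data["hits"] += 1     # reuse the open handle and just seek forward

tensors = [
    FakeLazyTensor("wte.weight", "0", 0),
    FakeLazyTensor("h.0.attn.weight", "1", 0),
    FakeLazyTensor("h.0.mlp.weight", "1", 4096),
    FakeLazyTensor("h.1.attn.weight", "2", 0),
]

# Sorted order (what the patch enforces): one miss per storage key, hits otherwise.
for t in sorted(tensors, key=lambda t: (t.key, t.seek_offset)):
    ChunkCache.read(t)
print(ChunkCache.hit_data)  # {'hits': 1, 'misses': 3}
```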
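Lazy loading also switches from the old monkey-patched `Linear`/`Embedding`/`LayerNorm` constructors to `accelerate.init_empty_weights()` plus the patched `_load_state_dict_into_meta_model`, which pushes each materialized tensor onto its mapped device with `set_module_tensor_to_device`. A condensed sketch of that mechanism on a throwaway two-layer model with a fake in-memory "checkpoint" (the real code does the same per tensor after `LazyTensor.materialize()` and with the device map from the first sketch):

```python
import torch
from torch import nn
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device

# Parameters are created on the "meta" device: structure without memory.
with init_empty_weights():
    model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))

# Fake checkpoint: random CPU tensors shaped like the meta parameters.
state_dict = {
    name: torch.randn_like(param, device="cpu")
    for name, param in model.named_parameters()
}
device_map = {"0": "cpu", "1": "cpu"}   # per-module targets, e.g. from get_device_map

for param_name, param in state_dict.items():
    module_name = param_name.rsplit(".", 1)[0]   # "0.weight" -> "0"
    set_module_tensor_to_device(
        model,
        tensor_name=param_name,
        device=device_map[module_name],
        value=param,
        dtype=torch.float32,
    )

print(model(torch.ones(1, 4)))   # the meta model is now fully materialized
```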