Merge pull request #39 from VE-FORBRYDERNE/breakmodel

Official transformers 6B breakmodel support and more RAM-efficient model loading
2021-11-27 01:11:48 +01:00 · 2021-11-27 01:11:48 +01:00 · 6008d4f3a5
parent 978dc486a5 e5e2fb088a
commit 6008d4f3a5
2 changed files with 94 additions and 63 deletions
--- a/aiserver.py
+++ b/aiserver.py
@ -14,7 +14,9 @@ from tkinter import messagebox
 import json
 import collections
 import zipfile
-from typing import Union, Dict, Set, List
+import packaging
+import contextlib
+from typing import Any, Union, Dict, Set, List

 import requests
 import html
@ -298,11 +300,14 @@ def device_config(model):
    model.transformer.ln_f.to(breakmodel.primary_device)
    if(hasattr(model, 'lm_head')):
        model.lm_head.to(breakmodel.primary_device)
-    if(not hasattr(model.config, 'rotary') or not model.config.rotary):
+    if(hasattr(model.transformer, 'wpe')):
        model.transformer.wpe.to(breakmodel.primary_device)
    gc.collect()
    GPTNeoModel.forward = breakmodel.new_forward
+    if("GPTJModel" in globals()):
+        GPTJModel.forward = breakmodel.new_forward
    generator = model.generate
+    breakmodel.move_hidden_layers(model.transformer)

 #==================================================================#
 # Startup
@ -537,7 +542,12 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
    if(not vars.noai):
        print("{0}Initializing transformers, please wait...{1}".format(colors.PURPLE, colors.END))
        from transformers import StoppingCriteria, GPT2Tokenizer, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM
+        try:
+            from transformers import GPTJModel
+        except:
+            pass
        import transformers.generation_utils
+        from transformers import __version__ as transformers_version

        # Patch transformers to use our soft prompt
        def patch_causallm(cls):
@ -699,14 +709,31 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
                except:
                    return int(model.lm_head.in_features)
        
+        def maybe_low_cpu_mem_usage() -> Dict[str, Any]:
+            if(packaging.version.parse(transformers_version) < packaging.version.parse("4.11.0")):
+                print(f"\nWARNING:  Please upgrade to transformers 4.11.0 for lower RAM usage.  You have transformers {transformers_version}.", file=sys.stderr)
+                return {}
+            return {"low_cpu_mem_usage": True}
+        
+        @contextlib.contextmanager
+        def maybe_use_float16(always_use=False):
+            if(always_use or (vars.hascuda and (vars.usegpu or vars.breakmodel))):
+                original_dtype = torch.get_default_dtype()
+                torch.set_default_dtype(torch.float16)
+                yield True
+                torch.set_default_dtype(original_dtype)
+            else:
+                yield False
+
        # If custom GPT Neo model was chosen
        if(vars.model == "NeoCustom"):
            model_config = open(vars.custmodpth + "/config.json", "r")
            js   = json.load(model_config)
+            with(maybe_use_float16()):
                if("model_type" in js):
-                model     = AutoModelForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache/")
+                    model     = AutoModelForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache/", **maybe_low_cpu_mem_usage())
                else:
-                model     = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache/")
+                    model     = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache/", **maybe_low_cpu_mem_usage())
            vars.modeldim = get_hidden_size_from_model(model)
            tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth, cache_dir="cache/")
            # Is CUDA available? If so, use GPU, otherwise fall back to CPU
@ -724,7 +751,8 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
        elif(vars.model == "GPT2Custom"):
            model_config = open(vars.custmodpth + "/config.json", "r")
            js   = json.load(model_config)
-            model     = GPT2LMHeadModel.from_pretrained(vars.custmodpth, cache_dir="cache/")
+            with(maybe_use_float16()):
+                model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, cache_dir="cache/", **maybe_low_cpu_mem_usage())
            tokenizer = GPT2Tokenizer.from_pretrained(vars.custmodpth, cache_dir="cache/")
            vars.modeldim = get_hidden_size_from_model(model)
            # Is CUDA available? If so, use GPU, otherwise fall back to CPU
@ -739,20 +767,22 @@ if(not vars.model in ["InferKit", "Colab", "OAI", "ReadOnly", "TPUMeshTransforme
            tokenizer = GPT2Tokenizer.from_pretrained(vars.model, cache_dir="cache/")
            if(vars.hascuda):
                if(vars.usegpu):
-                    model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/")
+                    with(maybe_use_float16()):
+                        model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
                    vars.modeldim = get_hidden_size_from_model(model)
                    model = model.half().to(0)
                    generator = model.generate
                elif(vars.breakmodel):  # Use both RAM and VRAM (breakmodel)
-                    model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/")
+                    with(maybe_use_float16()):
+                        model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
                    vars.modeldim = get_hidden_size_from_model(model)
                    device_config(model)
                else:
-                    model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/")
+                    model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
                    vars.modeldim = get_hidden_size_from_model(model)
                    generator = model.generate
            else:
-                model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/")
+                model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache/", **maybe_low_cpu_mem_usage())
                vars.modeldim = get_hidden_size_from_model(model)
                generator = model.generate
        
--- a/breakmodel.py
+++ b/breakmodel.py
@ -215,6 +215,7 @@ import torch
 import torch.cuda.comm
 import copy
 import gc
+import sys
 import itertools
 import bisect

@ -229,6 +230,48 @@ gpu_blocks = []
 primary_device = 0


+def move_hidden_layers(transformer):
+    assert len(gpu_blocks) <= torch.cuda.device_count()
+    assert sum(gpu_blocks) <= len(transformer.h)
+    ram_blocks = len(transformer.h) - sum(gpu_blocks)
+
+    transformer.extrastorage = {}
+    torch.cuda.empty_cache()
+    
+    able_to_pin_layers = True
+    for i in range(ram_blocks):
+        transformer.h[i].to("cpu")
+        transformer.extrastorage[i] = copy.deepcopy(transformer.h[i])
+        smalltensor = torch.tensor(0).to(primary_device)
+        for param1 in transformer.h[i].parameters():
+            param1.data = smalltensor
+        transformer.h[i].to(primary_device)
+        for param in transformer.extrastorage[i].parameters():
+            param.requires_grad = False
+            param.data = param.data.detach()
+            if able_to_pin_layers:
+                try:
+                    param.data = param.data.pin_memory()
+                except:
+                    able_to_pin_layers = False
+                    print(f"WARNING:  You only have enough shared GPU memory for {i} out of {ram_blocks} CPU layers.  Expect suboptimal speed.", file=sys.stderr)
+            gc.collect()
+            torch.cuda.empty_cache()
+
+    if ram_blocks:
+        for param1,param2 in zip(transformer.h[0].parameters(),transformer.extrastorage[0].parameters()):
+            param1.data = param2.data.to(primary_device, non_blocking=False).detach()
+
+        for param1,param2 in zip(transformer.h[ram_blocks-1].parameters(),transformer.extrastorage[ram_blocks-1].parameters()):
+            param1.data = param2.data.to(primary_device, non_blocking=False).detach()
+
+    i = ram_blocks
+    for j in range(len(gpu_blocks)):
+        for _ in range(gpu_blocks[j]):
+            transformer.h[i].to(j)
+            i += 1
+
+
 def new_forward(
        self,
        input_ids=None,
@ -249,38 +292,6 @@ def new_forward(
    ram_blocks = len(self.h) - sum(gpu_blocks)
    cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))

-    if breakmodel:
-        if not hasattr(self, 'extrastorage'):
-            setattr(self,"extrastorage",{})
-            torch.cuda.empty_cache()
-
-            for i in range(ram_blocks):
-                self.h[i].to("cpu")
-                self.extrastorage[i] = copy.deepcopy(self.h[i])
-                smalltensor = torch.tensor(0).to(primary_device)
-                for param1 in self.h[i].parameters():
-                    param1.data = smalltensor
-                self.h[i].to(primary_device)
-                for param in self.extrastorage[i].parameters():
-                    param.requires_grad = False
-                    param.data = param.data.detach().pin_memory()
-                    gc.collect()
-                    torch.cuda.empty_cache()
-
-            if ram_blocks:
-                for param1,param2 in zip(self.h[0].parameters(),self.extrastorage[0].parameters()):
-                    param1.data = param2.data.to(primary_device, non_blocking=False).detach()
-
-                for param1,param2 in zip(self.h[ram_blocks-1].parameters(),self.extrastorage[ram_blocks-1].parameters()):
-                    param1.data = param2.data.to(primary_device, non_blocking=False).detach()
-
-            i = ram_blocks
-            for j in range(len(gpu_blocks)):
-                for _ in range(gpu_blocks[j]):
-                    self.h[i].to(j)
-                    i += 1
-
-

    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
@ -322,33 +333,27 @@ def new_forward(
    # Attention mask.
    if attention_mask is not None:
        assert batch_size > 0, "batch_size has to be defined and > 0"
-        global_attention_mask = attention_mask.view(batch_size, -1)
+        attention_mask = attention_mask.view(batch_size, -1)
        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        global_attention_mask = global_attention_mask[:, None, None, :]
+        attention_mask = attention_mask[:, None, None, :]

-        # Since global_attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
-        global_attention_mask = global_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
-        global_attention_mask = (1.0 - global_attention_mask) * -10000.0
-    else:
-        global_attention_mask = None
-
-    # Local causal attention mask
-    batch_size, seq_length = input_shape
-    full_seq_length = seq_length + past_length
+        attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+        attention_mask = (1.0 - attention_mask) * -10000.0

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x num_heads x N x N
    # head_mask has shape n_layer x batch x num_heads x N x N
-    head_mask = self.get_head_mask(head_mask, self.config.num_layers)
+    head_mask = self.get_head_mask(head_mask, getattr(self.config, "num_layers", None) or self.config.n_layer)

    if inputs_embeds is None:
        if breakmodel:
@ -364,7 +369,7 @@ def new_forward(
            inputs_embeds[:, pos:pos+emb.shape[1]] = emb
            offset += emb.shape[1]

-    if hasattr(self, 'rotary') and self.rotary:
+    if getattr(self, "wpe", None) is None:
        hidden_states = inputs_embeds
    else:
        if breakmodel:
@ -400,9 +405,6 @@ def new_forward(
                    with torch.cuda.stream(copystream):
                        torch.cuda.comm.broadcast(param2.data,out = [param1.data])

-        attn_type = self.config.attention_layers[i]
-        attn_mask = global_attention_mask
-
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states.cpu(),)

@ -410,8 +412,7 @@ def new_forward(

            if use_cache:
                logger.warning(
-                    "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
-                    "`use_cache=False`..."
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

@ -426,7 +427,7 @@ def new_forward(
                create_custom_forward(block),
                hidden_states,
                None,
-                attn_mask,
+                attention_mask,
                head_mask[i],
            )
        else:
@ -435,7 +436,7 @@ def new_forward(
            outputs = block(
                hidden_states.to(device) if breakmodel and hidden_states is not None else hidden_states,
                layer_past=tuple(v.to(device) for v in layer_past if v is not None) if breakmodel and layer_past is not None and i >= ram_blocks and len(layer_past) and layer_past[0].device.index != device else layer_past,
-                attention_mask=attn_mask.to(device) if breakmodel and attn_mask is not None else attn_mask,
+                attention_mask=attention_mask.to(device) if breakmodel and attention_mask is not None else attention_mask,
                head_mask=head_mask[i].to(device) if breakmodel and head_mask[i] is not None else head_mask[i],
                use_cache=use_cache,
                output_attentions=output_attentions,