From f8bcc3411b9104ccb04fbc32df156770d03f9cdf Mon Sep 17 00:00:00 2001
From: Gnome Ann <>
Date: Thu, 25 Nov 2021 11:44:41 -0500
Subject: [PATCH] In breakmodel mode, move layers to GPU as soon as model
 loads

Rather than during the first generation.
---
 aiserver.py   |  1 +
 breakmodel.py | 67 +++++++++++++++++++++++++++------------------------
 2 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/aiserver.py b/aiserver.py
index 1d02dd67..c550fc2c 100644
--- a/aiserver.py
+++ b/aiserver.py
@@ -303,6 +303,7 @@ def device_config(model):
     gc.collect()
     GPTNeoModel.forward = breakmodel.new_forward
     generator = model.generate
+    breakmodel.move_hidden_layers(model.transformer)
 
 #==================================================================#
 # Startup
diff --git a/breakmodel.py b/breakmodel.py
index 5724d4e2..e1f3ce6e 100644
--- a/breakmodel.py
+++ b/breakmodel.py
@@ -229,6 +229,41 @@ gpu_blocks = []
 primary_device = 0
 
 
+def move_hidden_layers(transformer):
+    assert len(gpu_blocks) <= torch.cuda.device_count()
+    assert sum(gpu_blocks) <= len(transformer.h)
+    ram_blocks = len(transformer.h) - sum(gpu_blocks)
+
+    transformer.extrastorage = {}
+    torch.cuda.empty_cache()
+
+    for i in range(ram_blocks):
+        transformer.h[i].to("cpu")
+        transformer.extrastorage[i] = copy.deepcopy(transformer.h[i])
+        smalltensor = torch.tensor(0).to(primary_device)
+        for param1 in transformer.h[i].parameters():
+            param1.data = smalltensor
+        transformer.h[i].to(primary_device)
+        for param in transformer.extrastorage[i].parameters():
+            param.requires_grad = False
+            param.data = param.data.detach().pin_memory()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    if ram_blocks:
+        for param1,param2 in zip(transformer.h[0].parameters(),transformer.extrastorage[0].parameters()):
+            param1.data = param2.data.to(primary_device, non_blocking=False).detach()
+
+        for param1,param2 in zip(transformer.h[ram_blocks-1].parameters(),transformer.extrastorage[ram_blocks-1].parameters()):
+            param1.data = param2.data.to(primary_device, non_blocking=False).detach()
+
+    i = ram_blocks
+    for j in range(len(gpu_blocks)):
+        for _ in range(gpu_blocks[j]):
+            transformer.h[i].to(j)
+            i += 1
+
+
 def new_forward(
     self,
     input_ids=None,
@@ -249,38 +284,6 @@ def new_forward(
         ram_blocks = len(self.h) - sum(gpu_blocks)
         cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
 
-        if breakmodel:
-            if not hasattr(self, 'extrastorage'):
-                setattr(self,"extrastorage",{})
-                torch.cuda.empty_cache()
-
-                for i in range(ram_blocks):
-                    self.h[i].to("cpu")
-                    self.extrastorage[i] = copy.deepcopy(self.h[i])
-                    smalltensor = torch.tensor(0).to(primary_device)
-                    for param1 in self.h[i].parameters():
-                        param1.data = smalltensor
-                    self.h[i].to(primary_device)
-                    for param in self.extrastorage[i].parameters():
-                        param.requires_grad = False
-                        param.data = param.data.detach().pin_memory()
-                    gc.collect()
-                    torch.cuda.empty_cache()
-
-                if ram_blocks:
-                    for param1,param2 in zip(self.h[0].parameters(),self.extrastorage[0].parameters()):
-                        param1.data = param2.data.to(primary_device, non_blocking=False).detach()
-
-                    for param1,param2 in zip(self.h[ram_blocks-1].parameters(),self.extrastorage[ram_blocks-1].parameters()):
-                        param1.data = param2.data.to(primary_device, non_blocking=False).detach()
-
-                i = ram_blocks
-                for j in range(len(gpu_blocks)):
-                    for _ in range(gpu_blocks[j]):
-                        self.h[i].to(j)
-                        i += 1
-
-
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
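
Note (not part of the patch): the hunks above move the one-time staging work out
of new_forward() and into the new move_hidden_layers(), which aiserver.py now
calls right after the model loads, so the first generation no longer pays for
deep-copying and pinning every CPU-resident layer. The sketch below is a minimal
illustration of that staging pattern reduced to a single toy layer; it assumes a
CUDA device is available, and the names `layer`, `storage`, and `dummy` are
hypothetical, not identifiers from the patch.

import copy

import torch
import torch.nn as nn

# Requires a CUDA device; the patch's primary_device plays the role of "cuda:0".
layer = nn.Linear(1024, 1024)

# Keep the real weights in page-locked (pinned) host RAM; pinned pages make
# host-to-GPU copies fast and allow asynchronous transfers.
storage = copy.deepcopy(layer)
for param in storage.parameters():
    param.requires_grad = False
    param.data = param.data.detach().pin_memory()

# Point every live parameter's .data at one shared dummy scalar, so moving
# the module to the GPU allocates almost no device memory.
dummy = torch.tensor(0.0, device="cuda:0")
for param in layer.parameters():
    param.data = dummy
layer.to("cuda:0")

# When the layer is actually needed, stream the pinned weights into the
# device-resident shell (what new_forward() does per block at generation time).
for live, pinned in zip(layer.parameters(), storage.parameters()):
    live.data = pinned.data.to("cuda:0", non_blocking=False).detach()

The eager copies of blocks h[0] and h[ram_blocks-1] in move_hidden_layers()
serve the same purpose: the first and last streamed blocks are already resident
on the primary device before generation begins.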