In breakmodel mode, move layers to GPU as soon as model loads

Rather than during the first generation.
Gnome Ann committed 2021-11-25 11:44:41 -05:00
parent 978dc486a5
commit f8bcc3411b
2 changed files with 36 additions and 32 deletions

aiserver.py

@@ -303,6 +303,7 @@ def device_config(model):
     gc.collect()
     GPTNeoModel.forward = breakmodel.new_forward
     generator = model.generate
+    breakmodel.move_hidden_layers(model.transformer)
 
 #==================================================================#
 # Startup

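The practical effect of this one-line addition: the expensive layer placement (CPU offload, pinned-memory copies, per-GPU distribution) now runs as the tail end of model loading instead of stalling the first generate() call. A minimal sketch of the resulting load sequence, paraphrased rather than quoted from aiserver.py, and assuming breakmodel.gpu_blocks has already been configured:

import torch
from transformers import GPTNeoForCausalLM
from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoModel

import breakmodel

model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B")
breakmodel.gpu_blocks = [16]  # e.g. 16 transformer blocks on GPU 0, rest in RAM

GPTNeoModel.forward = breakmodel.new_forward       # hook the forward pass
generator = model.generate
breakmodel.move_hidden_layers(model.transformer)   # new: place layers at load time

# The first generation no longer pays the one-time placement cost:
with torch.no_grad():
    output = generator(torch.tensor([[50256]]), max_length=8)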
breakmodel.py

@@ -229,6 +229,41 @@
 gpu_blocks = []
 primary_device = 0
 
+def move_hidden_layers(transformer):
+    assert len(gpu_blocks) <= torch.cuda.device_count()
+    assert sum(gpu_blocks) <= len(transformer.h)
+    ram_blocks = len(transformer.h) - sum(gpu_blocks)
+
+    transformer.extrastorage = {}
+    torch.cuda.empty_cache()
+
+    for i in range(ram_blocks):
+        transformer.h[i].to("cpu")
+        transformer.extrastorage[i] = copy.deepcopy(transformer.h[i])
+        smalltensor = torch.tensor(0).to(primary_device)
+        for param1 in transformer.h[i].parameters():
+            param1.data = smalltensor
+        transformer.h[i].to(primary_device)
+        for param in transformer.extrastorage[i].parameters():
+            param.requires_grad = False
+            param.data = param.data.detach().pin_memory()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    if ram_blocks:
+        for param1,param2 in zip(transformer.h[0].parameters(),transformer.extrastorage[0].parameters()):
+            param1.data = param2.data.to(primary_device, non_blocking=False).detach()
+
+        for param1,param2 in zip(transformer.h[ram_blocks-1].parameters(),transformer.extrastorage[ram_blocks-1].parameters()):
+            param1.data = param2.data.to(primary_device, non_blocking=False).detach()
+
+    i = ram_blocks
+    for j in range(len(gpu_blocks)):
+        for _ in range(gpu_blocks[j]):
+            transformer.h[i].to(j)
+            i += 1
+
+
 def new_forward(
     self,
     input_ids=None,
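The core trick in the move_hidden_layers function added above: each RAM-resident block keeps its real weights in a deepcopy pinned to page-locked host memory (extrastorage), while the live module's parameters are pointed at a single throwaway scalar, so the block itself costs almost no VRAM until its weights are streamed in. The preload of h[0] and h[ram_blocks-1] at the end then warms up the blocks the forward pass cycles through. A stripped-down illustration of the stash/restore pattern; the helper names are hypothetical, only the mechanics mirror the patch:

import copy
import gc
import torch
import torch.nn as nn

def stash_block(block: nn.Module, device: int = 0) -> nn.Module:
    # Park the block's real weights in pinned host RAM and leave only a
    # placeholder scalar on the GPU (mirrors the first loop above).
    block.to("cpu")
    storage = copy.deepcopy(block)              # real weights live here
    dummy = torch.tensor(0).to(device)
    for p in block.parameters():
        p.data = dummy                          # drop the block's own storage
    block.to(device)                            # module bookkeeping on the GPU
    for p in storage.parameters():
        p.requires_grad = False
        p.data = p.data.detach().pin_memory()   # pinned => fast host-to-device copies
    gc.collect()
    torch.cuda.empty_cache()
    return storage

def restore_block(block: nn.Module, storage: nn.Module, device: int = 0) -> None:
    # Stream the pinned copy back into the GPU-resident module, like the
    # zip() loops at the end of move_hidden_layers.
    for p, src in zip(block.parameters(), storage.parameters()):
        p.data = src.data.to(device, non_blocking=False).detach()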
@@ -249,38 +284,6 @@ def new_forward(
     ram_blocks = len(self.h) - sum(gpu_blocks)
     cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks))
 
-    if breakmodel:
-        if not hasattr(self, 'extrastorage'):
-            setattr(self,"extrastorage",{})
-            torch.cuda.empty_cache()
-
-            for i in range(ram_blocks):
-                self.h[i].to("cpu")
-                self.extrastorage[i] = copy.deepcopy(self.h[i])
-                smalltensor = torch.tensor(0).to(primary_device)
-                for param1 in self.h[i].parameters():
-                    param1.data = smalltensor
-                self.h[i].to(primary_device)
-                for param in self.extrastorage[i].parameters():
-                    param.requires_grad = False
-                    param.data = param.data.detach().pin_memory()
-                gc.collect()
-                torch.cuda.empty_cache()
-
-        if ram_blocks:
-            for param1,param2 in zip(self.h[0].parameters(),self.extrastorage[0].parameters()):
-                param1.data = param2.data.to(primary_device, non_blocking=False).detach()
-
-            for param1,param2 in zip(self.h[ram_blocks-1].parameters(),self.extrastorage[ram_blocks-1].parameters()):
-                param1.data = param2.data.to(primary_device, non_blocking=False).detach()
-
-        i = ram_blocks
-        for j in range(len(gpu_blocks)):
-            for _ in range(gpu_blocks[j]):
-                self.h[i].to(j)
-                i += 1
-
-
     output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
     output_hidden_states = (
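With placement guaranteed to have happened at load time, the hasattr(self, 'extrastorage') lazy-initialization guard removed above becomes unnecessary: by the time new_forward runs, the pinned copies always exist. What remains for generation is streaming each RAM-resident block through GPU memory as it is needed. A simplified sketch of that consumption side, under the same assumptions as before (the real loop also reuses staging buffers, spreads the remaining blocks across GPUs via cumulative_gpu_blocks, and threads the key/value cache through; run_hidden_layers is a hypothetical name):

import torch

def run_hidden_layers(transformer, hidden_states, ram_blocks, device=0):
    # Simplified per-layer streaming in the spirit of new_forward.
    dummy = torch.tensor(0).to(device)
    for i, block in enumerate(transformer.h):
        if i < ram_blocks:
            # Pull this block's real weights out of pinned RAM onto the GPU.
            for p, src in zip(block.parameters(),
                              transformer.extrastorage[i].parameters()):
                p.data = src.data.to(device, non_blocking=False).detach()
        hidden_states = block(hidden_states)[0]
        if i < ram_blocks:
            # Point the parameters back at the placeholder so VRAM stays at
            # roughly one streamed block at a time.
            for p in block.parameters():
                p.data = dummy
    return hidden_states

Doing the deepcopy/pin_memory work once at load time is what removes the long stall that previously hit the first generation.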