From b20d80ca2a9f13908202d8479bf901b383aeae2b Mon Sep 17 00:00:00 2001
From: vfbd
Date: Wed, 2 Nov 2022 19:02:09 -0400
Subject: [PATCH] Add vocab padding to embedding bias in gptj.json

---
 maps/gptj.json     | 4 ++--
 tpu_mtj_backend.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/maps/gptj.json b/maps/gptj.json
index 8e0bc9da..08b22130 100644
--- a/maps/gptj.json
+++ b/maps/gptj.json
@@ -9,11 +9,11 @@
     },
     "static_weights": {
         "transformer.wte.weight": {"mtj": {"module": "embedding_shard/~/linear", "param": "w", "transforms": ["no_transpose", "vocab_pad"]}},
-        "transformer.wte.bias": {"mtj": {"module": "embedding_shard/~/linear", "param": "b"}},
+        "transformer.wte.bias": {"mtj": {"module": "embedding_shard/~/linear", "param": "b", "transforms": ["vocab_pad"]}},
         "transformer.ln_f.weight": {"mtj": {"module": "projection_shard/~/replicated_layer_norm", "param": "scale"}},
         "transformer.ln_f.bias": {"mtj": {"module": "projection_shard/~/replicated_layer_norm", "param": "offset"}},
         "lm_head.weight": {"mtj": {"module": "projection_shard/~/linear", "param": "w", "transforms": ["vocab_pad"]}},
-        "lm_head.bias": {"mtj": {"module": "projection_shard/~/linear", "param": "b"}}
+        "lm_head.bias": {"mtj": {"module": "projection_shard/~/linear", "param": "b", "transforms": ["vocab_pad"]}}
     },
     "layer_weights": {
         "transformer.h.{layer}.attn.bias": {},
diff --git a/tpu_mtj_backend.py b/tpu_mtj_backend.py
index d992ba45..64484393 100644
--- a/tpu_mtj_backend.py
+++ b/tpu_mtj_backend.py
@@ -1304,7 +1304,7 @@ def load_model(path: str, driver_version="tpu_driver0.1_dev20210607", hf_checkpo
                     if "divide_by_shards" in transforms:
                         tensor /= params["cores_per_replica"]
                     if "vocab_pad" in transforms:
-                        tensor = torch.nn.functional.pad(tensor, (0, 0, 0, params["n_vocab_padding"]))
+                        tensor = torch.nn.functional.pad(tensor, (0,) * (tensor.ndim * 2 - 1) + (params["n_vocab_padding"],))
                     if "no_transpose" not in transforms and tensor.ndim == 2:
                         tensor = tensor.T
                     tensor.unsqueeze_(0)
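
Note on the tpu_mtj_backend.py hunk: torch.nn.functional.pad reads its pad widths from the last dimension backwards, so a tuple of length 2 * tensor.ndim whose only non-zero entry is the final one pads the trailing edge of dim 0 (the vocab dimension). That lets the same "vocab_pad" transform apply to the 1-D bias tensors newly listed in gptj.json as well as the existing 2-D weights, which the old hard-coded (0, 0, 0, n) pad widths only handled for 2-D tensors. The sketch below is not part of the patch; it only illustrates the behaviour with made-up shapes and an assumed n_vocab_padding value.

import torch

# Minimal sketch (not part of the patch): made-up shapes and padding amount.
n_vocab_padding = 6                  # assumed example value

weight = torch.zeros(10, 4)          # stands in for a 2-D weight like lm_head.weight
bias = torch.zeros(10)               # stands in for a 1-D bias like lm_head.bias

for tensor in (weight, bias):
    # pad widths are consumed from the last dim backwards, so only the final
    # entry is non-zero: it pads the trailing edge of dim 0, the vocab dimension.
    pad = (0,) * (tensor.ndim * 2 - 1) + (n_vocab_padding,)
    padded = torch.nn.functional.pad(tensor, pad)
    print(tuple(padded.shape))       # (16, 4) for the weight, (16,) for the bias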