Add TPU support for OPT-350M

The 350M model appears to have a different structure (separate project_in/project_out weights) than the other OPT models; needs confirmation.
This commit is contained in:
Gnome Ann
2022-05-12 22:21:15 -04:00
parent dfa2aa7314
commit 4fa5f1cd6a
3 changed files with 9 additions and 6 deletions

View File

@@ -3,13 +3,16 @@
"mtj_pe": "fixed",
"mtj_config_map": {
"do_layer_norm_before": ["do_layer_norm_before", true],
"d_embed": "word_embed_proj_dim",
"d_model": "hidden_size",
"n_heads": "num_attention_heads",
"layers": "num_hidden_layers"
},
"static_weights": {
"decoder.embed_tokens.weight": {"mtj": {"module": "embedding_shard/~/linear", "param": "w", "transforms": ["no_transpose", "vocab_pad"]}},
"decoder.embed_positions.weight": {"mtj": {"module": "embedding_shard", "param": "pos_embs", "transforms": ["no_transpose", "remove_first_two_rows"]}}
"decoder.project_in.weight": {"mtj": {"module": "embedding_shard", "param": "project_in"}},
"decoder.embed_positions.weight": {"mtj": {"module": "embedding_shard", "param": "pos_embs", "transforms": ["no_transpose", "remove_first_two_rows"]}},
"decoder.project_out.weight": {"mtj": {"module": "projection_shard", "param": "project_out"}}
},
"layer_weights": {
"decoder.layers.{layer}.self_attn.q_proj.weight": {"mtj": {"module": "layer_{layer}/~/linear", "param": "w"}},