Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions explorations/identity_first_layer_mlp_sweep.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# identity_first_layer_mlp_sweep.yaml
# Explore per-layer MLP sizes when the first block uses identity attention.
# The first transformer block skips attention by using the "identity" variant,
# while later blocks retain standard causal self-attention. We sweep a handful
# of first-layer MLP expansion sizes while keeping the remaining blocks at the
# default 4x hidden size.

# Settings common to every run in this sweep. Each key holds a one-element
# list, so none of these values varies between runs.

# Model shape
block_size: [256]
n_layer: [4]
n_head: [8]
n_embd: [512]
# 2048 = 4 x n_embd; the default expansion used by the reference layers
mlp_size: [2048]

# Data
dataset: ['shakespeare_char']

# Runtime
device: ['cuda']
dtype: ['bfloat16']
compile: [true]

# Attention variant for each of the four blocks, in layer order: only the
# first block is "identity"; the remaining three are "causal".
attention_variant_layerlist:
  - ['identity', 'causal', 'causal', 'causal']

# Sweep the first-layer MLP width (512 .. 2560, i.e. 1x .. 5x of n_embd=512)
# while layers 2-4 stay at 2048.
#
# Indentation is significant here: the width lists must be nested *under*
# mlp_size_layerlist. If they sit at the same column as the
# "- mlp_size_layerlist:" entry, YAML parses mlp_size_layerlist as null and
# treats each width list as a separate parameter group.
parameter_groups:
  - mlp_size_layerlist:
      - [512, 2048, 2048, 2048]
      - [1024, 2048, 2048, 2048]
      - [1536, 2048, 2048, 2048]
      - [2048, 2048, 2048, 2048]
      - [2560, 2048, 2048, 2048]