
muP implementation #637

Open · wants to merge 151 commits into base: main

Commits (151)
52f3d53
mup olmo
AkshitaB Feb 21, 2024
4d715ad
wip
AkshitaB Mar 20, 2024
43d674d
Merge branch 'main' into akshitab-scale
AkshitaB Mar 20, 2024
675290f
MuOLMo, coord check (wip)
AkshitaB Mar 21, 2024
e4a80bc
coord check runs
AkshitaB Apr 10, 2024
2354425
remove extra args, add readme
AkshitaB Apr 10, 2024
07df33e
requirements
AkshitaB Apr 10, 2024
2dd327b
Update README.md
AkshitaB Apr 10, 2024
74f56a9
Update README.md
AkshitaB Apr 10, 2024
325cad1
coord checks
AkshitaB Apr 10, 2024
a31b698
legend for coord checks
AkshitaB Apr 10, 2024
d1533b1
updates to mup implementation
AkshitaB Apr 11, 2024
c4dcbda
coord checks
AkshitaB Apr 11, 2024
ff3778a
Merge branch 'akshitab-scale' of https://github.com/allenai/OLMo into…
AkshitaB Apr 11, 2024
0f39e6e
coord checks
AkshitaB Apr 11, 2024
c180a8e
smaller model
AkshitaB Apr 11, 2024
d498385
fixes for network output
AkshitaB Apr 11, 2024
e566b72
smaller bs, larger model
AkshitaB Apr 11, 2024
12016bd
lr=0.005, bs=2
AkshitaB Apr 11, 2024
a5807a1
confirm normal init
AkshitaB Apr 11, 2024
4285275
scale by d
AkshitaB Apr 11, 2024
85b23fd
scale by d
AkshitaB Apr 11, 2024
fe5c71d
remove unnecessary code
AkshitaB Apr 11, 2024
c875dcc
extra cleanup
AkshitaB Apr 11, 2024
27a4eef
more width
AkshitaB Apr 11, 2024
1a35e1c
Update README.md
AkshitaB Apr 11, 2024
9aa8e66
Update README.md
AkshitaB May 7, 2024
ccbcd08
don't use readout
AkshitaB May 7, 2024
0ea01e3
init scale
AkshitaB May 7, 2024
29eaf76
rope
AkshitaB May 7, 2024
d4ab24f
readme
AkshitaB May 7, 2024
520fd72
remove plots
AkshitaB May 8, 2024
ea58502
formatting
AkshitaB May 8, 2024
d3abad1
get_batch_loss func
AkshitaB May 8, 2024
d61de42
wip: train, eval funcs
AkshitaB May 8, 2024
acd5032
gitignore
AkshitaB May 8, 2024
505f46e
bug fix
AkshitaB May 9, 2024
5a73a1a
coords file
AkshitaB May 9, 2024
cd326aa
easy file name
AkshitaB May 10, 2024
efcd37b
script for examining coords
AkshitaB May 10, 2024
f7f7f34
Merge branch 'main' into akshitab-scale
AkshitaB Jun 23, 2024
a9ea54b
add mup to main model
AkshitaB Jun 24, 2024
06101ad
tests
AkshitaB Jun 24, 2024
6b65600
easier testing
AkshitaB Jun 24, 2024
d088220
simplified coord check
AkshitaB Jun 24, 2024
c2595ea
30 iter coord checks
AkshitaB Jun 24, 2024
b055d41
rename
AkshitaB Jun 24, 2024
424a361
refactor, add tests
AkshitaB Jun 24, 2024
effe212
fix save_base_shapes
AkshitaB Jun 24, 2024
67158cb
Revert "fix save_base_shapes"
AkshitaB Jun 24, 2024
5c02f3f
Revert "refactor, add tests"
AkshitaB Jun 24, 2024
f6722b5
rename output files
AkshitaB Jun 24, 2024
f0775f8
remove extra code
AkshitaB Jun 24, 2024
febca85
function for save_base_shapes
AkshitaB Jun 24, 2024
41d293d
make plot optional, for testing purposes
AkshitaB Jun 24, 2024
d060fb5
make widths configurable
AkshitaB Jun 24, 2024
e7b6f66
split out the args
AkshitaB Jun 24, 2024
14db801
rename plotdir
AkshitaB Jun 24, 2024
e001879
add tests
AkshitaB Jun 24, 2024
eb2f3ee
rename to coordinates
AkshitaB Jun 24, 2024
6bbca2d
remove dict_in_out
AkshitaB Jun 24, 2024
5213a21
remove unused args
AkshitaB Jun 25, 2024
258d8d3
always compute loss
AkshitaB Jun 25, 2024
c44c8f1
add bigger config
AkshitaB Jun 25, 2024
282fbe6
updated outputs
AkshitaB Jun 25, 2024
81b331f
reorganize
AkshitaB Jun 25, 2024
954f761
update paths in readme
AkshitaB Jun 25, 2024
48fa137
add scaffolding for scaling laws
AkshitaB Jun 25, 2024
1d5eaf2
Merge branch 'main' into akshitab-scale
AkshitaB Jun 28, 2024
ea900ee
fix lint, etc
AkshitaB Jun 28, 2024
1302614
correct mup version
AkshitaB Jun 28, 2024
7fb6839
minor updates
AkshitaB Jun 29, 2024
96a61b1
fix pyproject for testing
AkshitaB Jun 29, 2024
59e9e67
update changelog
AkshitaB Jun 30, 2024
e3b7253
ensure config runs
AkshitaB Jun 30, 2024
26d1abc
orig_params with fsdp
AkshitaB Jul 1, 2024
dc53949
Merge branch 'akshitab-scale' of https://github.com/allenai/OLMo into…
AkshitaB Jul 1, 2024
288e4a2
remove unnecessary code
AkshitaB Jul 1, 2024
e8ea0fa
Merge branch 'akshitab-scale' of https://github.com/allenai/OLMo into…
AkshitaB Jul 1, 2024
f70ffa4
updated coord checks
AkshitaB Jul 1, 2024
b0dfe0a
simplified base config
AkshitaB Jul 1, 2024
561b813
use mup
AkshitaB Jul 1, 2024
16c935e
scripts for running check
AkshitaB Jul 1, 2024
e9193cb
rename runs
AkshitaB Jul 1, 2024
823ec37
fixes
AkshitaB Jul 1, 2024
1b83d56
no need with 1 node
AkshitaB Jul 1, 2024
fb76dd0
no weka
AkshitaB Jul 1, 2024
137deaf
install on the fly
AkshitaB Jul 1, 2024
35c9a88
bug fix
AkshitaB Jul 1, 2024
9ff73f6
debug
AkshitaB Jul 1, 2024
fe9fb4d
bug fix again
AkshitaB Jul 1, 2024
078c459
full runs
AkshitaB Jul 1, 2024
c7f6820
no warmup
AkshitaB Jul 1, 2024
016c3c5
linear warmup, same base shape
AkshitaB Jul 1, 2024
50073c0
use correct optimizer
AkshitaB Jul 1, 2024
3ef14dc
fix mup optimizer
AkshitaB Jul 1, 2024
9299f82
fix
AkshitaB Jul 1, 2024
1f15eeb
lower priority
AkshitaB Jul 1, 2024
96aed44
base shapes
AkshitaB Jul 1, 2024
15a2390
fix
AkshitaB Jul 1, 2024
ff8520b
save progress
AkshitaB Jul 1, 2024
019daf7
run on jupiter
AkshitaB Jul 1, 2024
59ef245
more cluster options
AkshitaB Jul 1, 2024
77d6d6d
priority
AkshitaB Jul 1, 2024
7cbf520
Revert "priority"
AkshitaB Jul 1, 2024
9b02f7d
ensure that correct lr is used
AkshitaB Jul 1, 2024
23a6986
simplify config further
AkshitaB Jul 1, 2024
f3218c1
update base shapes
AkshitaB Jul 1, 2024
e27d806
stability at init
AkshitaB Jul 1, 2024
09925d1
run sp baseline
AkshitaB Jul 2, 2024
a4a5b6e
fix readout init
AkshitaB Jul 2, 2024
5dc7210
Merge branch 'akshitab-scale' of https://github.com/allenai/OLMo into…
AkshitaB Jul 2, 2024
d4b395b
updated paths
AkshitaB Jul 2, 2024
1adea44
sp scripts
AkshitaB Jul 2, 2024
5cf94df
run for remaining
AkshitaB Jul 2, 2024
10d93cf
base config for experiments
AkshitaB Jul 9, 2024
9251161
Merge branch 'main' into akshitab-scale
AkshitaB Jul 18, 2024
4fe7b3f
Merge branch 'main' into akshitab-scale
AkshitaB Jul 18, 2024
a750ca8
black
AkshitaB Jul 18, 2024
61ffe77
7B width should be multiple of 64
AkshitaB Jul 18, 2024
91d32fb
move general functions to utils
AkshitaB Jul 18, 2024
23bbed8
mup ladder
AkshitaB Jul 18, 2024
73a6346
new scales
AkshitaB Jul 18, 2024
0b1994d
remove cluster
AkshitaB Jul 18, 2024
10dc91d
remove cluster
AkshitaB Jul 18, 2024
51810df
install mup
AkshitaB Jul 18, 2024
60b4bf5
fix device
AkshitaB Jul 18, 2024
a3b1e81
flash attn
AkshitaB Jul 18, 2024
7e7d596
debug
AkshitaB Jul 18, 2024
42c1d14
more debug
AkshitaB Jul 18, 2024
702afc8
test with more layers
AkshitaB Jul 19, 2024
b600ba9
less layers
AkshitaB Jul 19, 2024
9f539b4
update shape
AkshitaB Jul 19, 2024
c56aff1
fsdp
AkshitaB Jul 19, 2024
a19f5f6
device batch size
AkshitaB Jul 19, 2024
a5c6747
hp sweep
AkshitaB Jul 19, 2024
014606b
make it divisible
AkshitaB Jul 19, 2024
9a3c2df
run with my key
AkshitaB Jul 19, 2024
4d15c81
don't rescale when loading from checkpoint
AkshitaB Jul 19, 2024
908463c
more checkpoints for debugging
AkshitaB Jul 19, 2024
ba2c6bf
checkpoint less often
AkshitaB Jul 19, 2024
de2cf97
run with standard parametrization
AkshitaB Jul 22, 2024
d80702e
no weka, expand cluster options
AkshitaB Jul 22, 2024
b5d040b
increase priority
AkshitaB Jul 22, 2024
4f2dc47
revert
AkshitaB Jul 22, 2024
dacaa95
increase timeout
AkshitaB Jul 22, 2024
32bf956
make mup optional
AkshitaB Jul 22, 2024
447ab47
fix
AkshitaB Jul 22, 2024
b3d871c
bug fix
AkshitaB Jul 22, 2024
8cc17c9
test with cosine
AkshitaB Jul 22, 2024
57ef19f
fix
AkshitaB Jul 22, 2024
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -33,6 +33,10 @@ doc/_build/
.DS_Store

# mup artifacts
#*.bsh

# python
*.pyc
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Added config options for `model.norm_after`, `model.scale_emb_init`, and `auxiliary_loss_multiplier` (used with zloss).
- Added scripts for running experiments on qk_norm, norm reordering, and zloss.
- Added `mup` implementation for OLMo.

### Changed
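For context on what the new `mup` option changes, the core of muP (maximal update parametrization) for Adam-family optimizers is a set of width-dependent scaling rules for hidden layers: learning rate scales like 1/width, initialization standard deviation like 1/sqrt(width), and the readout logits are multiplied by base_width/width. The sketch below is illustrative only; the function name and structure are hypothetical and not taken from the OLMo codebase.

```python
# Hypothetical sketch of muP scaling rules for hidden layers under an
# Adam-style optimizer. Not the OLMo implementation; names are illustrative.
import math

def mup_hidden_hparams(base_lr: float, base_std: float,
                       base_width: int, width: int) -> dict:
    """Transfer hidden-layer hyperparameters tuned at base_width to width.

    Under muP with Adam:
      - learning rate shrinks like 1/width,
      - init std shrinks like 1/sqrt(width),
      - readout (output) logits are scaled by base_width/width.
    """
    ratio = base_width / width
    return {
        "learning_rate": base_lr * ratio,         # Adam LR ~ 1/fan_in
        "init_std": base_std * math.sqrt(ratio),  # init variance ~ 1/fan_in
        "output_multiplier": ratio,               # MuReadout-style scaling
    }

# Example: transfer hyperparameters tuned at d_model=128 to d_model=4096.
hp = mup_hidden_hparams(base_lr=1.0e-3, base_std=0.02,
                        base_width=128, width=4096)
print(hp["learning_rate"])  # 3.125e-05
```

This is why the configs below fix a small base width (`d_model: 128`) and let the calling script toggle `use_mup`: hyperparameters found at the base width are meant to transfer to wider models without re-tuning.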
134 changes: 134 additions & 0 deletions configs/mup/base-olmo-cosine.yaml
@@ -0,0 +1,134 @@
run_name: base-olmo
seed: 6198
dry_run: false

wandb:
  name: ${run_name}
  project: olmo-mup

model:
  use_mup: false  # set in the calling script
  mup_query_zero_init: true
  d_model: 128
  n_heads: 2
  n_layers: 2
  mlp_ratio: 1
  weight_tying: false
  alibi: false
  rope: true
  flash_attention: false
  attention_dropout: 0.0
  attention_layer_norm: false
  clip_qkv: null
  include_bias: false
  block_type: sequential
  layer_norm_type: rms
  layer_norm_with_affine: true
  layer_norm_eps: 1e-6
  bias_for_layer_norm: false
  attention_layer_norm_with_affine: false
  activation_type: swiglu
  residual_dropout: 0.0
  embedding_dropout: 0.0
  max_sequence_length: 4096
  vocab_size: 50280
  embedding_size: 50304
  eos_token_id: 0
  pad_token_id: 1
  init_device: cuda
  init_fn: normal
  init_std: 0.02
  init_cutoff_factor: 3

ddp:
  grad_sync_mode: batch
  find_unused_params: false

compile: null

optimizer:
  name: adamw
  learning_rate: 1.0e-3
  weight_decay: 0.1
  eps: 1e-8
  decay_norm_and_bias: true
  decay_embeddings: false
  betas:
    - 0.9
    - 0.95
  metrics_log_interval: 10

scheduler:
  name: cosine_with_warmup
  # t_warmup: 3 * ${model.d_model} / 128  # assuming current model size (128 = 13M)
  alpha_f: 0.01
  warmup_min_lr: 0.0

tokenizer:
  identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json
  truncate_direction: right

save_folder: workspace/${run_name}  # doesn't matter since we'll upload to S3
remote_save_folder: s3://ai2-llm/checkpoints/olmo-mup/${run_name}
save_overwrite: false

# Unsharded checkpoints (for ddp)
save_interval_unsharded: 5000
save_num_unsharded_checkpoints_to_keep: -1

load_path: null

max_duration: 1ep
stop_at: 10000
global_train_batch_size: 1024
device_train_microbatch_size: 16

precision: amp_bf16
distributed_strategy: ddp

gen1_gc_interval: 1

max_grad_norm: 1.0
max_grad_norm_ratio: null

speed_monitor:
  window_size: 20

eval_interval: 5000
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}
evaluators:
  - label: all-small-ppl-validation
    data:
      num_workers: 0
      drop_last: true
      datasets:
        wikitext_103-validation:
          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy

  ##########################
  # Downstream evaluations #
  ##########################

  - label: hellaswag
    type: downstream

data:
  pad_direction: right
  num_workers: 32
  drop_last: true
  pin_memory: true
  prefetch_factor: 8
  persistent_workers: true
  timeout: 0
  instance_filter:
    repetition_max_period: 13
    repetition_min_period: 1
    repetition_max_count: 32
  paths:
    ######### NON WEB DATA #########
    # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample
    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy
    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy
    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy
    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy
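Many commits in this PR run "coord checks" against configs like the one above. The idea, sketched below in a dependency-free form, is that under muP the typical size of activation coordinates should stay O(1) as width grows. This toy version simulates a single linear layer y = Wx with W entries drawn with variance 1/fan_in and checks that the mean coordinate magnitude is roughly constant across widths; the actual check in the PR trains small OLMo models at several widths and plots per-layer activation scales over training steps.

```python
# Toy coordinate check: with 1/sqrt(fan_in) initialization, the typical
# coordinate size of y = Wx stays O(1) regardless of width. This is a
# simplified illustration, not the PR's coord-check script.
import random
import statistics

def mean_coord_size(width: int, seed: int = 0) -> float:
    rng = random.Random(seed)
    x = [rng.gauss(0.0, 1.0) for _ in range(width)]
    std = width ** -0.5  # init std ~ 1/sqrt(fan_in) for a square layer
    y = []
    for _ in range(width):
        row = [rng.gauss(0.0, std) for _ in range(width)]
        y.append(sum(w * xi for w, xi in zip(row, x)))
    return statistics.mean(abs(v) for v in y)

for width in (64, 256, 1024):
    # Mean |y_i| concentrates near sqrt(2/pi) ~ 0.8 at every width,
    # instead of growing or shrinking with width.
    print(width, round(mean_coord_size(width), 3))
```

A parametrization that fails this check (e.g. fixed init std independent of width) shows activation scales that drift systematically as width increases, which is exactly what the coord-check plots in the commit history are meant to catch.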
134 changes: 134 additions & 0 deletions configs/mup/base-olmo.yaml
@@ -0,0 +1,134 @@
run_name: base-olmo
seed: 6198
dry_run: false

wandb:
  name: ${run_name}
  project: olmo-mup

model:
  use_mup: false  # set in the calling script
  mup_query_zero_init: true
  d_model: 128
  n_heads: 2
  n_layers: 2
  mlp_ratio: 1
  weight_tying: false
  alibi: false
  rope: true
  flash_attention: false
  attention_dropout: 0.0
  attention_layer_norm: false
  clip_qkv: null
  include_bias: false
  block_type: sequential
  layer_norm_type: rms
  layer_norm_with_affine: true
  layer_norm_eps: 1e-6
  bias_for_layer_norm: false
  attention_layer_norm_with_affine: false
  activation_type: swiglu
  residual_dropout: 0.0
  embedding_dropout: 0.0
  max_sequence_length: 4096
  vocab_size: 50280
  embedding_size: 50304
  eos_token_id: 0
  pad_token_id: 1
  init_device: cuda
  init_fn: normal
  init_std: 0.02
  init_cutoff_factor: 3

ddp:
  grad_sync_mode: batch
  find_unused_params: false

compile: null

optimizer:
  name: adamw
  learning_rate: 1.0e-3
  weight_decay: 0.1
  eps: 1e-8
  decay_norm_and_bias: true
  decay_embeddings: false
  betas:
    - 0.9
    - 0.95
  metrics_log_interval: 10

scheduler:
  name: constant  # linear_with_warmup
  # t_warmup: 10
  alpha_f: 0.1
  # warmup_min_lr: 0

tokenizer:
  identifier: tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json
  truncate_direction: right

save_folder: workspace/${run_name}  # doesn't matter since we'll upload to S3
remote_save_folder: s3://ai2-llm/checkpoints/olmo-mup/${run_name}
save_overwrite: false

# Unsharded checkpoints (for ddp)
save_interval_unsharded: 5000
save_num_unsharded_checkpoints_to_keep: -1

load_path: null

max_duration: 1ep
stop_at: 10000
global_train_batch_size: 1024
device_train_microbatch_size: 16

precision: amp_bf16
distributed_strategy: ddp

gen1_gc_interval: 1

max_grad_norm: 1.0
max_grad_norm_ratio: null

speed_monitor:
  window_size: 20

eval_interval: 5000
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}
evaluators:
  - label: all-small-ppl-validation
    data:
      num_workers: 0
      drop_last: true
      datasets:
        wikitext_103-validation:
          - s3://ai2-llm/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy

  ##########################
  # Downstream evaluations #
  ##########################

  - label: hellaswag
    type: downstream

data:
  pad_direction: right
  num_workers: 32
  drop_last: true
  pin_memory: true
  prefetch_factor: 8
  persistent_workers: true
  timeout: 0
  instance_filter:
    repetition_max_period: 13
    repetition_min_period: 1
    repetition_max_count: 32
  paths:
    ######### NON WEB DATA #########
    # ~> WIKIPEDIA & WIKIBOOKS (3.689 GT), repeated twice to up-sample
    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy
    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy
    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy
    - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/wiki/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy