diff --git a/Dockerfile b/Dockerfile index dd6b95a97..eca45c7bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -91,6 +91,7 @@ RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url h RUN pip install packaging RUN pip install flash-attn==2.6.3 --no-build-isolation RUN pip install -r requirements.txt +RUN pip install git+https://github.com/arcee-ai/mergekit.git # NLTK download RUN python -m nltk.downloader punkt diff --git a/configs/beaker_configs/default_dpo.yaml b/configs/beaker_configs/default_dpo.yaml index 87685b5fe..08eacd1a7 100644 --- a/configs/beaker_configs/default_dpo.yaml +++ b/configs/beaker_configs/default_dpo.yaml @@ -8,7 +8,7 @@ tasks: command: [ '/bin/sh', '-c' ] - arguments: ['PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + arguments: ['pip install --upgrade transformers && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch --mixed_precision bf16 --num_machines 1 --num_processes 4 @@ -37,7 +37,7 @@ tasks: - name: TRANSFORMERS_CACHE value: ./cache/ - name: WANDB_API_KEY - secret: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY - name: WANDB_PROJECT value: open-instruct - name: WANDB_WATCH @@ -47,7 +47,7 @@ tasks: - name: WANDB_DISABLED value: true - name: HF_TOKEN - secret: HF_TOKEN + secret: jacobm_HF_TOKEN datasets: - mountPath: /oe-adapt-default source: diff --git a/configs/beaker_configs/default_eval.yaml b/configs/beaker_configs/default_eval.yaml index 3b4553ed6..ba9f55731 100644 --- a/configs/beaker_configs/default_eval.yaml +++ b/configs/beaker_configs/default_eval.yaml @@ -35,16 +35,19 @@ tasks: - name: WANDB_DISABLED value: true - name: OPENAI_API_KEY - secret: openai_api_key + secret: jacobm_OPENAI_API_KEY - name: HF_TOKEN - secret: HF_TOKEN + secret: jacobm_HF_TOKEN datasets: - - mountPath: /data/ + - mountPath: /oe-adapt-default source: - beaker: hamishivi/open-instruct-eval-data + weka: oe-adapt-default - mountPath: /model source: beaker: 01GVYXDGJC6DV0JW9JZ16YM07G + - mountPath: /data/ + source: + beaker: hamishivi/open-instruct-eval-data - mountPath: /net/nfs.cirrascale source: hostPath: /net/nfs.cirrascale diff --git a/configs/beaker_configs/default_finetune.yaml b/configs/beaker_configs/default_finetune.yaml index bd5e05c06..d10680de5 100644 --- a/configs/beaker_configs/default_finetune.yaml +++ b/configs/beaker_configs/default_finetune.yaml @@ -37,7 +37,7 @@ tasks: - name: TRANSFORMERS_CACHE value: ./cache/ - name: WANDB_API_KEY - secret: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY - name: WANDB_PROJECT value: open-instruct - name: WANDB_WATCH @@ -47,11 +47,14 @@ tasks: - name: WANDB_DISABLED value: true - name: HF_TOKEN - secret: HF_TOKEN + secret: jacobm_HF_TOKEN datasets: - mountPath: /oe-adapt-default source: weka: oe-adapt-default + - mountPath: /oe-training-default + source: + weka: oe-training-default result: path: /output resources: diff --git a/configs/beaker_configs/default_finetune_multinode.yaml b/configs/beaker_configs/default_finetune_multinode.yaml index 03ed976af..7b10fa8c7 100644 --- a/configs/beaker_configs/default_finetune_multinode.yaml +++ b/configs/beaker_configs/default_finetune_multinode.yaml @@ -51,7 +51,7 @@ tasks: - name: TRANSFORMERS_CACHE value: ./cache/ - name: WANDB_API_KEY - secret: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY - name: WANDB_PROJECT value: open-instruct - name: WANDB_WATCH @@ -61,11 +61,14 @@ tasks: - name: WANDB_DISABLED value: true - name: HF_TOKEN - secret: HF_TOKEN + secret: jacobm_HF_TOKEN datasets: - mountPath: /oe-adapt-default source: weka: oe-adapt-default + # - mountPath: /model 
+ # source: + # beaker: jacobm/llama-3.1-8b result: path: /output resources: diff --git a/configs/beaker_configs/default_finetune_multinode_augusta.yaml b/configs/beaker_configs/default_finetune_multinode_augusta.yaml new file mode 100644 index 000000000..3766bdc4b --- /dev/null +++ b/configs/beaker_configs/default_finetune_multinode_augusta.yaml @@ -0,0 +1,128 @@ +version: v2 +description: open-instruct-finetune-multinode +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune-multinode + replicas: 4 + leaderSelection: true + hostNetworking: true + propagateFailure: true + propagatePreemption: true + synchronizedStartTimeout: 60m + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: [' + unset CUDA_LAUNCH_BLOCKING && export LD_LIBRARY_PATH=/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH} && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 4 + --num_processes 32 + --machine_rank $BEAKER_REPLICA_RANK + --main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME + --main_process_port 29400 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf + --deepspeed_multinode_launcher standard + open_instruct/finetune.py + --model_name_or_path meta-llama/Meta-Llama-3-8B + --tokenizer_name meta-llama/Meta-Llama-3-8B + --use_slow_tokenizer + --use_flash_attn + --max_seq_length 4096 + --preprocessing_num_workers 16 + --per_device_train_batch_size 1 + --gradient_accumulation_steps 4 + --learning_rate 5e-6 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. + --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + --reduce_loss sum + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + - name: NCCL_CROSS_NIC + value: 0 + - name: NCCL_ALGO + value: Ring,Tree + - name: NCCL_PROTO + value: Simple + - name: NCCL_MIN_NCHANNELS + value: 4 + - name: NCCL_P2P_NET_CHUNKSIZE + value: 524288 + - name: NCCL_P2P_PCI_CHUNKSIZE + value: 524288 + - name: NCCL_P2P_NVL_CHUNKSIZE + value: 1048576 + - name: NCCL_FASTRAK_NUM_FLOWS + value: 2 + - name: NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL + value: 0 + - name: NCCL_BUFFSIZE + value: 8388608 + - name: NCCL_FASTRAK_USE_SNAP + value: 1 + - name: CUDA_VISIBLE_DEVICES + value: 0,1,2,3,4,5,6,7 + - name: NCCL_NET_GDR_LEVEL + value: PIX + - name: NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING + value: 0 + - name: NCCL_TUNER_PLUGIN + value: libnccl-tuner.so + - name: NCCL_TUNER_CONFIG_PATH + value: /var/lib/tcpxo/lib64/a3plus_tuner_config.textproto + - name: NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE + value: /var/lib/tcpxo/lib64/a3plus_guest_config.textproto + - name: NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS + value: 600000 + - name: NCCL_NVLS_ENABLE + value: 0 + - name: NCCL_DEBUG + value: WARN + - name: NCCL_FASTRAK_CTRL_DEV + value: enp0s12 + - name: NCCL_FASTRAK_IFNAME + value: enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 + - name: NCCL_SOCKET_IFNAME + value: enp0s12 + - name: NCCL_USE_SNAP + value: 1 + - name: NCCL_FASTRAK_USE_LLCM + value: 1 + - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY + value: /dev/aperture_devices + + datasets: + - mountPath: /oe-adapt-default + 
source: + weka: oe-adapt-default + result: + path: /output + resources: + gpuCount: 8 + context: + priority: normal + preemptible: true \ No newline at end of file diff --git a/configs/beaker_configs/default_finetune_multinode_olmo.yaml b/configs/beaker_configs/default_finetune_multinode_olmo.yaml new file mode 100644 index 000000000..b42800cee --- /dev/null +++ b/configs/beaker_configs/default_finetune_multinode_olmo.yaml @@ -0,0 +1,78 @@ +version: v2 +description: open-instruct-finetune-multinode +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune-multinode + replicas: 4 + leaderSelection: true + hostNetworking: true + propagateFailure: true + propagatePreemption: true + synchronizedStartTimeout: 60m + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: [' + unset CUDA_LAUNCH_BLOCKING && pip install git+https://github.com/vwxyzjn/transformers.git@olmo1124_classification && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 4 + --num_processes 32 + --machine_rank $BEAKER_REPLICA_RANK + --main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME + --main_process_port 29400 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf + --deepspeed_multinode_launcher standard + open_instruct/finetune.py + --model_name_or_path meta-llama/Meta-Llama-3-8B + --tokenizer_name meta-llama/Meta-Llama-3-8B + --use_slow_tokenizer + --use_flash_attn + --max_seq_length 4096 + --preprocessing_num_workers 16 + --per_device_train_batch_size 1 + --gradient_accumulation_steps 4 + --learning_rate 5e-6 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. + --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + --reduce_loss sum + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + datasets: + - mountPath: /oe-adapt-default + source: + weka: oe-adapt-default + # - mountPath: /model + # source: + # beaker: jacobm/llama-3.1-8b + result: + path: /output + resources: + gpuCount: 8 + context: + priority: normal + preemptible: true \ No newline at end of file diff --git a/configs/beaker_configs/default_finetune_multinode_olmoe.yaml b/configs/beaker_configs/default_finetune_multinode_olmoe.yaml new file mode 100644 index 000000000..771dc41b7 --- /dev/null +++ b/configs/beaker_configs/default_finetune_multinode_olmoe.yaml @@ -0,0 +1,78 @@ +version: v2 +description: open-instruct-finetune-multinode +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune-multinode + replicas: 4 + leaderSelection: true + hostNetworking: true + propagateFailure: true + propagatePreemption: true + synchronizedStartTimeout: 60m + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: [' + unset CUDA_LAUNCH_BLOCKING && pip install --upgrade transformers && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 4 + --num_processes 32 + --machine_rank $BEAKER_REPLICA_RANK + --main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME + --main_process_port 29400 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf + 
--deepspeed_multinode_launcher standard + open_instruct/finetune.py + --model_name_or_path meta-llama/Meta-Llama-3-8B + --tokenizer_name meta-llama/Meta-Llama-3-8B + --use_slow_tokenizer + --use_flash_attn + --max_seq_length 4096 + --preprocessing_num_workers 16 + --per_device_train_batch_size 1 + --gradient_accumulation_steps 4 + --learning_rate 5e-6 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. + --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + --reduce_loss sum + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + datasets: + - mountPath: /oe-adapt-default + source: + weka: oe-adapt-default + # - mountPath: /model + # source: + # beaker: jacobm/llama-3.1-8b + result: + path: /output + resources: + gpuCount: 8 + context: + priority: normal + preemptible: true \ No newline at end of file diff --git a/configs/beaker_configs/default_finetune_olmo.yaml b/configs/beaker_configs/default_finetune_olmo.yaml new file mode 100644 index 000000000..7a3236d17 --- /dev/null +++ b/configs/beaker_configs/default_finetune_olmo.yaml @@ -0,0 +1,65 @@ +version: v2 +description: open-instruct-finetune +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: ['pip install git+https://github.com/vwxyzjn/transformers.git@olmo1124_classification && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 1 + --num_processes 4 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf + open_instruct/finetune.py + --model_name_or_path /hf_llama_models + --use_flash_attn + --max_seq_length 2048 + --preprocessing_num_workers 16 + --per_device_train_batch_size 2 + --gradient_accumulation_steps 16 + --learning_rate 2e-5 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. 
+ --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + datasets: + - mountPath: /oe-adapt-default + source: + weka: oe-adapt-default + - mountPath: /oe-training-default + source: + weka: oe-training-default + result: + path: /output + resources: + gpuCount: 4 + context: + cluster: ai2/allennlp-cirrascale + priority: high + preemptible: false \ No newline at end of file diff --git a/configs/beaker_configs/default_merge.yaml b/configs/beaker_configs/default_merge.yaml new file mode 100644 index 000000000..446acee54 --- /dev/null +++ b/configs/beaker_configs/default_merge.yaml @@ -0,0 +1,40 @@ +version: v2 +description: open-instruct-merge-models +budget: ai2/oe-adapt +tasks: + - name: open-instruct-merge-models + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: ['mkdir {OUTPUT_DIR}; echo {RAW_CONFIG} > {OUTPUT_DIR}/config.yaml; mergekit-yaml {OUTPUT_DIR}/config.yaml {OUTPUT_DIR} --cuda'] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + result: + path: /output + resources: + gpuCount: 1 + context: + priority: low + preemptible: true + constraints: + cluster: + - ai2/neptune-cirrascale + - ai2/saturn-cirrascale + - ai2/jupiter-cirrascale-2 \ No newline at end of file diff --git a/configs/merge_configs/70b-soup.yaml b/configs/merge_configs/70b-soup.yaml new file mode 100644 index 000000000..6ba42cf09 --- /dev/null +++ b/configs/merge_configs/70b-soup.yaml @@ -0,0 +1,18 @@ +merge_method: linear +normalize: true +models: + # - name: L3.1-70B-v3.9-nc-2e-6-2_ep-fixed + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-70B-v3.9-nc-2e-6-2_ep-fixed/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + - name: L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-2 + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-2/ + wekaBucket: "oe-adapt-default" + weight: 1.0 + - name: L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-3 + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-3/ + wekaBucket: "oe-adapt-default" + weight: 1.0 \ No newline at end of file diff --git a/configs/merge_configs/base_configs/default_linear_merge.yaml b/configs/merge_configs/base_configs/default_linear_merge.yaml new file mode 100644 index 000000000..3557bc440 --- /dev/null +++ b/configs/merge_configs/base_configs/default_linear_merge.yaml @@ -0,0 +1,10 @@ +models: + - model: /model-one + parameters: + weight: 1.0 + - model: /model-two + parameters: + weight: 1.0 +normalize: true +merge_method: linear +dtype: bfloat16 \ No newline at end of file diff --git a/configs/merge_configs/base_configs/default_task_arithmetic_merge.yaml b/configs/merge_configs/base_configs/default_task_arithmetic_merge.yaml new file mode 
100644 index 000000000..e2f55159a --- /dev/null +++ b/configs/merge_configs/base_configs/default_task_arithmetic_merge.yaml @@ -0,0 +1,14 @@ +models: + # no parameters necessary for base model + - model: /base-model + - model: /model-one + parameters: + weight: 0.70 + normalize: False + - model: /model-2 + parameters: + weight: 0.30 + normalize: False +merge_method: task_arithmetic +base_model: /base-model +dtype: bfloat16 \ No newline at end of file diff --git a/configs/merge_configs/example_linear_merge_config.yaml b/configs/merge_configs/example_linear_merge_config.yaml new file mode 100644 index 000000000..8f902c28f --- /dev/null +++ b/configs/merge_configs/example_linear_merge_config.yaml @@ -0,0 +1,11 @@ +merge_method: linear +normalize: true +models: + - name: name + location: beaker + path: jacobm/beaker-dataset + weight: 0.5 + - name: name2 + location: huggingface + path: allenai/llama-3-tulu-2 + weight: 0.5 \ No newline at end of file diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml new file mode 100644 index 000000000..3c7246a6d --- /dev/null +++ b/configs/merge_configs/my-merge-config.yaml @@ -0,0 +1,48 @@ +merge_method: linear +normalize: true +models: + # - name: llama-3.1-8b-resized + # location: huggingface + # path: ai2-adapt-dev/llama-3.1-8b-resized + # weight: 0.5 + # - name: L3.1-8B-v3.9-nc-fixed-soup-best_2 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-best_2/ + # wekaBucket: "oe-adapt-default" + # weight: 0.5 + + - name: gsm_math_if_valpy_best_overall_avg_8b_beta0.05-step_200 + location: weka + path: /oe-adapt-default/hamishi/model_checkpoints/gsm_math_if_valpy_best_overall_avg_8b_beta0.05_checkpoints/step_200/ + wekaBucket: "oe-adapt-default" + weight: 1.0 + - name: gsm_math_if_valpy_best_and_if_avg_8b_beta0.05-step_200 + location: weka + path: /oe-adapt-default/hamishi/model_checkpoints/gsm_math_if_valpy_best_and_if_avg_8b_beta0.05_checkpoints/step_200/ + wekaBucket: "oe-adapt-default" + weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-2 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-2/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-3 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-3/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-1 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-1/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-5 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-5/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-4 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-4/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 \ No newline at end of file diff --git a/configs/merge_configs/my-task-arithmetic-config.yaml b/configs/merge_configs/my-task-arithmetic-config.yaml new file mode 100644 index 000000000..d91e75d30 --- /dev/null +++ b/configs/merge_configs/my-task-arithmetic-config.yaml @@ -0,0 +1,22 @@ +merge_method: task_arithmetic +base_model: ai2-adapt-dev/llama-3.1-8b-resized +normalize: true +models: + # - name: deepseek-math-7b-instruct + # location: huggingface + # path: 
deepseek-ai/deepseek-math-7b-instruct + # weight: 0.5 + # - name: deepseek-coder-7b-instruct-v1.5 + # location: huggingface + # path: deepseek-ai/deepseek-coder-7b-instruct-v1.5 + # weight: 0.5 + - name: L3.1-8B-v3.8-nc-final + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-nc-final/ + wekaBucket: "oe-adapt-default" + weight: 1.0 + - name: L3.1-8B-v3.8-math_subset + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-math_subset/ + wekaBucket: "oe-adapt-default" + weight: 0.46 \ No newline at end of file diff --git a/configs/train_configs/dpo/my-test-dpo.yaml b/configs/train_configs/dpo/my-test-dpo.yaml new file mode 100644 index 000000000..543b50c53 --- /dev/null +++ b/configs/train_configs/dpo/my-test-dpo.yaml @@ -0,0 +1,31 @@ +model_name_or_path: /model +tokenizer_name: /model +model_revision: main +use_flash_attn: true +gradient_checkpointing: true +# dataset_name: ai2-adapt-dev/tulu3.4-sft-replica-50k +# dataset_config_name: gpt4-prefs-on-policy +dataset_mixer: + ai2-adapt-dev/tulu3.4-sft-replica-50k-gpt4-prefs-on-policy: 1.0 + ai2-adapt-dev/personahub_if_pref_data_manualseed_v2_19890: 1.0 + ai2-adapt-dev/helpsteer2-uf-pipeline-regen: 1.0 + allenai/ultrafeedback_binarized_cleaned_train: 1.0 +use_slow_tokenizer: true +max_seq_length: 2048 +preprocessing_num_workers: 16 +per_device_train_batch_size: 1 +gradient_accumulation_steps: 16 # designed for 8 GPUs, so batch size 128 +learning_rate: 5.0e-7 +lr_scheduler_type: linear +warmup_ratio: 0.1 +weight_decay: 0.0 +num_train_epochs: 1 +output_dir: /output +with_tracking: true +report_to: + - wandb +logging_steps: 1 +use_lora: false +dpo_loss_type: dpo_norm +dpo_beta: 5 +checkpointing_steps: 1000 \ No newline at end of file diff --git a/configs/train_configs/dpo/olmoe_dpo_test.yaml b/configs/train_configs/dpo/olmoe_dpo_test.yaml new file mode 100644 index 000000000..bed7f3037 --- /dev/null +++ b/configs/train_configs/dpo/olmoe_dpo_test.yaml @@ -0,0 +1,37 @@ +model_name_or_path: /model +tokenizer_name: /model +model_revision: main +use_flash_attn: true +gradient_checkpointing: true +dataset_mixer: + # ai2-adapt-dev/sft_v3.9_used_off_policy: 1.0 + # ai2-adapt-dev/sft_v3.9_used_on_policy_large_70b_ckpt: 1.0 + # ai2-adapt-dev/DaringAnteater-prefs-RM-filter-uf-pipeline-regen-v3.9_large_70b_ckpt: 1.0 + # ai2-adapt-dev/WildChat-prefs-280824-uf-pipeline-regen-v3.9_large_70b_ckpt: 1.0 + # ai2-adapt-dev/Llama-3.1-if_taxonomy_tulu-uf-pipeline-regen-v3.9_large_70b_ckpt: 1.0 + ai2-adapt-dev/wildchat_v3.9_unused_off_policy: 1.0 + + ai2-adapt-dev/sft_v3.9_used_p0_olmoe-1b-7b: 1.0 + ai2-adapt-dev/sft_v3.9_used_p1_olmoe-1b-7b: 1.0 + ai2-adapt-dev/daring_anteater_olmoe-1b-7b: 1.0 + ai2-adapt-dev/wildchat-prefs-280824_olmoe-1b-7b: 1.0 + ai2-adapt-dev/llama3.1-if_taxonomy_tulu_olmoe-1b-7b: 1.0 +use_slow_tokenizer: true +max_seq_length: 2048 +preprocessing_num_workers: 16 +per_device_train_batch_size: 2 +gradient_accumulation_steps: 8 # designed for 8 GPUs, so batch size 128 +learning_rate: 5.0e-7 +lr_scheduler_type: linear +warmup_ratio: 0.1 +weight_decay: 0.0 +num_train_epochs: 1 +output_dir: /output +with_tracking: true +report_to: + - wandb +logging_steps: 1 +use_lora: false +dpo_loss_type: dpo_norm +dpo_beta: 5 +checkpointing_steps: 1000 \ No newline at end of file diff --git a/configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml b/configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml new file mode 100644 index 000000000..9fe931118 --- 
/dev/null +++ b/configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml @@ -0,0 +1,56 @@ +model_name_or_path: allenai/open_instruct_dev +model_revision: peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-olmo_1124 +use_flash_attn: true +tokenizer_name: allenai/open_instruct_dev +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 huggingface dataset + # allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # General datasets: + ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # Math datasets: + ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # Coding datasets: + ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # IF datasets: + ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # Safety datasets: + # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + + # Specialty datasets: + ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 4 # effective batch size 128 with 4 nodes +learning_rate: 5.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/olmoe_v3.9.yaml b/configs/train_configs/sft/olmoe_v3.9.yaml new file mode 100644 index 000000000..c4b61014c --- /dev/null +++ b/configs/train_configs/sft/olmoe_v3.9.yaml @@ -0,0 +1,52 @@ +model_name_or_path: allenai/OLMoE-1B-7B-0924 +model_revision: main +tokenizer_name: allenai/OLMoE-1B-7B-0924 +use_slow_tokenizer: true +dataset_mixer: + allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # # General datasets: + # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + # allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # # Specialty datasets: + # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + # 
ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 +max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3 +preprocessing_num_workers: 128 +per_device_train_batch_size: 2 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 1 # effective batch size 128 with 4 nodes +learning_rate: 2.0e-05 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +dataset_mix_dir: /output/ +checkpointing_steps: epoch +# keep_last_n_checkpoints: 1 +# load_balancing_loss: false # TODO: set to false +# load_balancing_weight: 0.5 +add_bos: true \ No newline at end of file diff --git a/configs/train_configs/sft/peteish_1124_preview_mix_v3.9.yaml b/configs/train_configs/sft/peteish_1124_preview_mix_v3.9.yaml new file mode 100644 index 000000000..e56ed1398 --- /dev/null +++ b/configs/train_configs/sft/peteish_1124_preview_mix_v3.9.yaml @@ -0,0 +1,59 @@ +model_name_or_path: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan/step11931-hf +model_revision: main +use_flash_attn: true +tokenizer_name: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan/step11931-hf +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 nc mix file + # WIP + + # Static v3.9 huggingface dataset + allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # # General datasets: + # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 # 49980 # all + # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # # Safety datasets: + # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + + # # Specialty datasets: + # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + +max_seq_length: 4096 # need to increase to 8k +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes +learning_rate: 2.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/qwen2.5_7b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/qwen2.5_7b_preview_mix_v3.9-noncommercial.yaml new file mode 100644 index 000000000..c5c8b488d --- /dev/null +++ 
b/configs/train_configs/sft/qwen2.5_7b_preview_mix_v3.9-noncommercial.yaml @@ -0,0 +1,56 @@ +model_name_or_path: Qwen/Qwen2.5-Math-7B +model_revision: main +use_flash_attn: true +tokenizer_name: Qwen/Qwen2.5-Math-7B +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 huggingface dataset + allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # # General datasets: + # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + # allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # # Safety datasets: + # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + + # # Specialty datasets: + # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 4 # effective batch size 128 with 8 nodes +learning_rate: 5.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/train-math-only-model.yaml b/configs/train_configs/sft/train-math-only-model.yaml new file mode 100644 index 000000000..86a600f9d --- /dev/null +++ b/configs/train_configs/sft/train-math-only-model.yaml @@ -0,0 +1,179 @@ +model_name_or_path: meta-llama/Meta-Llama-3-8B +model_revision: main +use_flash_attn: true +tokenizer_name: meta-llama/Meta-Llama-3-8B +use_slow_tokenizer: true +dataset_mixer: + # ------------------------------------------------------ + # no_robot dataset, human written, for general chat. + # Total: 9500 + # Pro: created by scale ai with high cost, should be high quality. + # Con: small, not diverse enough, may not be in consistent style. + HuggingFaceH4/no_robots: 9500 + # ------------------------------------------------------ + # OpenAssistant dataset, human written, for general chat. + # Here, only the highest rated paths are extracted. + # Total: 7708 + # Pro: created and reviewed by human volunteers, has multi-turn chat. + # Con: small, still has some noise, the writting quality may not be as good/careful as paid workers, style consistency. + # TODO: need to check if this version corresponds to the highest rated paths. + allenai/openassistant-guanaco-reformatted: 7708 + # ------------------------------------------------------ + # LIMA dataset, human written, for general chat. + # Some instances were filtered in building Tulu 2, probably due to some identity keywords. 
+  # Total: 1018
+  # Pro: created by researchers at Meta, aiming for diversity and high quality.
+  # Con: small, they were created quite early so might not consider some of the latest answering styles of chatbots.
+  # natolambert/tulu-v2-sft-mixture-lima: 1018
+  # ------------------------------------------------------
+  # Aya dataset, human written, for general chat (multilingual).
+  # Total: 202362
+  # Pro: created by ..., aiming for very diverse languages ().
+  # Con: answers may not be in the perfect style.
+  # ai2-adapt-dev/aya_dataset-reformat: 202362
+  # ------------------------------------------------------
+  # Tulu hard-coded examples, human written, for identity-related questions.
+  # Total: 14
+  # Pro: necessary to make Tulu aware of itself and its builders.
+  # Con: small, low coverage of possible questions from users.
+  # TODO: we should later find ways to replicate this multiple times.
+  ai2-adapt-dev/tulu_hard_coded_examples: 14
+  # ------------------------------------------------------
+  # CoT subset in FLAN v2, human (researchers) converted from existing datasets, for reasoning.
+  # Here, we use the subset processed in Tulu v2.
+  # Total: 48747
+  # Pro: researchers converted from 9 chain-of-thought datasets about arithmetic, multi-hop reasoning, and NLI.
+  # Con: limited in the task type, written early, may have inconsistent styles compared to today's chatbots.
+  # natolambert/tulu-v2-sft-mixture-cot: 49747
+  # ------------------------------------------------------
+  # SciRIFF dataset, human (researchers) converted from existing datasets, for scientific literature understanding.
+  # Here, we use the subset extracted by the author in building allenai/SciRIFF-train-mix.
+  # Total: 35357
+  # Pro: researchers converted from existing datasets for 54 scientific literature understanding tasks.
+  # Con: limited in the task type, may have inconsistent styles compared to today's chatbots.
+  # TODO: need to ablate and compare with the one in the tulu 2 mixture natolambert/tulu-v2-sft-mixture-science
+  # natolambert/tulu-v2-sft-mixture-science: 7468 # original data slightly different
+  # ai2-adapt-dev/SciRIFF-train-mix-science: 10000
+  # ------------------------------------------------------
+  # SlimOrca dataset, gpt4 generated, for general chat.
+  # Total: 517982
+  # Pro: Pairing FLAN v2 inputs with system prompts, and regenerating the outputs using GPT4, potentially in a better style.
+  # Con: GPT4 responses may contain errors, which may be mitigated by the filtering in SlimOrca
+  # TODO: need to ablate and compare with the 300K one Faeze created. May benefit from regeneration.
+  # ai2-adapt-dev/slim-orca-300k: 100000
+  ai2-adapt-dev/SlimOrca-reformat: 100000
+  # ------------------------------------------------------
+  # WizardLM evol instruct dataset, gpt4 generated, for general chat.
+  # Total: 196000
+  # Pro: the approach deepens the complexity of gpt4-generated data
+  # Con: GPT4 generations have errors, and may also inherit the biases/styles of GPT4
+  # TODO: need to ablate.
+  WizardLMTeam/WizardLM_evol_instruct_V2_196k: 30000
+  # ------------------------------------------------------
+  # WildChat dataset, real user queries + gpt4 responses, for general chat.
+  # Total: 254663 (1M if including those interacting with gpt 3.5)
+  # Pro: real user queries, may contain diverse challenging scenarios, as well as unsafe prompts. Multi-turn.
+  # Con: user queries are usually not that well-formatted, and contain a lot of noise.
+  # ai2-adapt-dev/WildChat-1M-Full-GPT4-Only: 254663
+  # ------------------------------------------------------
+  # ShareGPT dataset, real user shared queries + gpt4 responses, for general chat.
+  # Total: 114046
+  # Pro: user shared queries usually contain interesting phenomena. Multi-turn.
+  # Con: unclear licensing; the responses were generated using an earlier version of GPT4.
+  # TODO: need to ablate. May benefit from regeneration.
+  # Vtuber-plan/sharegpt-cleaned: 114046
+  # ------------------------------------------------------
+  # Daring-Anteater, a mix of existing datasets, for general chat.
+  # Total: 99532
+  # Pro: a good mix of precise_instruction_following / json_format_following / complex instructions.
+  # Con: the constraint-following part is too small.
+  # TODO: need to ablate whether excluding the main chat subset is helpful.
+  # TODO: data needs to be reformatted to consider the system prompt.
+  ai2-adapt-dev/Daring-Anteater-reformat: 99532
+  # ------------------------------------------------------
+  # MetaMathQA dataset, augmented using gpt4, for math capability.
+  # Total: 395000
+  # Pro: augmented towards GSM/MATH, so good performance on these two benchmarks (probably similar questions too)
+  # Con: may be too targeted at the two benchmarks and fail to generalize to other math problems in different styles.
+  # ai2-adapt-dev/metamath-qa-reformat: 100000
+  # ------------------------------------------------------
+  # WebInstruct dataset, extracted & rewritten using gpt4, (mainly) for math/science related questions
+  # Here, we are using their released subset.
+  # Total: 2335220
+  # Pro: the generation benefits from GPT4's answering style & the correctness of grounding to web documents.
+  # Con: may be biased by the response styles of the three websites (MathStackExchange, ScienceStackExchange, Socratic);
+  # the question answering styles are also not diverse enough, with different instruction constraints;
+  # the answers may still have some errors (10% based on the paper)
+  # TODO: need to ablate the effect.
+  # ai2-adapt-dev/WebInstructSub-reformat: 100000
+  # ------------------------------------------------------
+  # Codefeedback Filtered Instruction, a mix of existing datasets, for coding
+  # The data mix includes:
+  #   Magicoder-OSS-Instruct
+  #   Python code subset of ShareGPT
+  #   Magicoder-Evol-Instruct
+  #   Evol-Instruct-Code
+  # Total: 156526
+  # Pro: a decent mix of existing coding prompts
+  # Con: curated mainly for the prompts in building the real CodeFeedback, so responses may be low quality (e.g., ShareGPT)
+  # TODO: change to individual datasets and ablate the effect. May benefit from regeneration.
+  # m-a-p/CodeFeedback-Filtered-Instruction: 156526
+  # ------------------------------------------------------
+  # Codefeedback dataset, a mix of existing datasets + feedback interaction generation, for coding
+  # Total: 66383
+  # Pro: single-turn packing + interaction simulation seems to create a good coding model that takes feedback across multiple turns.
+  # Con: not sure how diverse the feedback is and how well it can generalize
+  # TODO: need to ablate. need to change code to downweight the intermediate responses with errors!!!
+  # m-a-p/Code-Feedback: 66383
+  # ------------------------------------------------------
+  # Table-GPT dataset, converted & synthesized, for table understanding and operations
+  # Total: 13222
+  # Pro: a special dataset that contains 14 table-related tasks for enhancing table capabilities
+  # Con: task types are limited. The tables may not be big enough.
+  # Response styles may be inconsistent.
+  # TODO: need to ablate.
+  # ai2-adapt-dev/Table-GPT-All-train: 3000
+  # ------------------------------------------------------
+  # Coconot dataset, generated by gpt4, for non-compliance
+  # Total: 11477
+  # Pro: a special dataset for a comprehensive list of non-compliance behaviors of models.
+  # Con: the generated queries may only reflect simple cases.
+  # TODO: need to ablate.
+  # ai2-adapt-dev/coconot-sft-reformat: 11477
+  # ------------------------------------------------------
+  # NuminaMath-TIR, extracted and generated by gpt4, for tool-integrated reasoning for math
+  # Total: 72441
+  # Pro: generally high-quality dataset with prompts mined from a web corpus, and verified tool-integrated reasoning trajectories.
+  # Con: mainly for solving math in a specific format, not in a format consistent with general chat.
+  # TODO: need to ablate. need to rewrite!!!
+  AI-MO/NuminaMath-TIR: 72441
+  # AI-MO/NuminaMath-CoT: 859000
+  # ------------------------------------------------------
+  # Xlam function calling dataset, synthesized and verified, for tool use
+  # Total: 60000
+  # Pro: a special dataset for enhancing function calling capability, good performance on BFCL
+  # Con: responses only contain the function calls and arguments, not in a style consistent with general chat.
+  # TODO: need to ablate. need to rewrite!!!
+  # Salesforce/xlam-function-calling-60k: 60000
+  # ------------------------------------------------------
+  # Lmsys chatbot arena data, human queries for challenging models, for general chat.
+  # Total: 1000000
+  # Pro: real human interaction with models, with reasonable challenges.
+  # Con: may not reflect the real challenges in actual use of AI models. The interactions include those with weak models.
+  # TODO: need to ablate. need to regenerate (the last step)!! the intermediate low-quality responses need to be downweighted.
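  # ------------------------------------------------------
  # Note on how the mixer values in this file are read (an assumption inferred from the
  # per-dataset annotations here and from the v3.9 configs earlier in this diff, not something
  # the diff states explicitly): an integer entry appears to be an absolute number of examples
  # to subsample, while 1.0 appears to mean "use the full dataset". As a worked example,
  # WizardLMTeam/WizardLM_evol_instruct_V2_196k: 30000 would keep 30000 of its 196000 examples
  # (roughly 15%), whereas HuggingFaceH4/no_robots: 9500 keeps all 9500.
  # ------------------------------------------------------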
+ # lmsys/lmsys-chat-1m: 1000000 +max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes +learning_rate: 5.0e-06 # best LR so far +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ \ No newline at end of file diff --git a/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml index 44ab2b8ce..140ed2bb6 100644 --- a/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml +++ b/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml @@ -55,4 +55,4 @@ logging_steps: 1 dataset_mix_dir: /output/ checkpointing_steps: 1000 keep_last_n_checkpoints: 1 -dataset_mix_dir: /output/ \ No newline at end of file +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml new file mode 100644 index 000000000..22a574f4f --- /dev/null +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml @@ -0,0 +1,70 @@ +model_name_or_path: meta-llama/Llama-3.1-8B +model_revision: main +use_flash_attn: true +tokenizer_name: meta-llama/Llama-3.1-8B +use_slow_tokenizer: true +dataset_mixer: + # Static v3.8 nc mix file + # /oe-adapt-default/jacobm/tulu-3-dev/data/tulu_v3.8_preview_nc.jsonl: 1.0 + allenai/tulu-v.3.8-mix-preview-noncommercial: 1.0 + + + # # # General datasets: + # ai2-adapt-dev/oasst1_converted: 7132 # all + # ai2-adapt-dev/flan_v2_converted: 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 240 # all + # ai2-adapt-dev/no_robots_converted: 9500 # all + # ai2-adapt-dev/wildchat_gpt4_converted: 100000 + + # # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 149960 # all + # ai2-adapt-dev/personahub_grade_math_v1_49980: 49980 # all + # ai2-adapt-dev/open_math_2_gsm8k_converted: 50000 + # AI-MO/NuminaMath-TIR: 72441 # all + # ai2-adapt-dev/personahub_math_interm_algebra_50000: 20000 + + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 107276 # all + + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 29980 # all + + # # Safety datasets: + # ai2-adapt-dev/coconot_converted: 10983 # all + # ai2-adapt-dev/processed-wildjailbreak: 50000 + # ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 50000 + + # # Specialty datasets: + # ai2-adapt-dev/sciriff_converted: 10000 + # ai2-adapt-dev/table_gpt_converted: 5000 + # ai2-adapt-dev/aya_dataset_converted: 100000 + + # # need to split for preferences: + # ai2-adapt-dev/wildchat_gpt4_converted: 100000 + # # ai2-adapt-dev/tulu_v3.8_unused_wildchat_prompts + # # ai2-adapt-dev/tulu_v3.8_unused_wildchat_conversations + # ai2-adapt-dev/open_math_2_gsm8k_converted: 50000 + # ai2-adapt-dev/personahub_math_interm_algebra_50000: 20000 + # ai2-adapt-dev/processed-wildjailbreak: 50000 + # ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 50000 + # ai2-adapt-dev/sciriff_converted: 10000 + # ai2-adapt-dev/table_gpt_converted: 5000 + # ai2-adapt-dev/aya_dataset_converted: 100000 + +max_seq_length: 4096 # 
need to increase to 8k +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 2 # effective batch size 128 with 1 node +learning_rate: 5.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial.yaml index 834dc99ff..fd1f81ee2 100644 --- a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial.yaml +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial.yaml @@ -41,7 +41,7 @@ dataset_mixer: max_seq_length: 4096 # need to increase to 8k preprocessing_num_workers: 128 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 4 # effective batch size 128 with 4 nodes +gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes learning_rate: 5.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial-augusta.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial-augusta.yaml new file mode 100644 index 000000000..ebb18fee4 --- /dev/null +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial-augusta.yaml @@ -0,0 +1,59 @@ +model_name_or_path: /model +model_revision: main +use_flash_attn: true +tokenizer_name: /model +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 nc mix file + # WIP + + # Static v3.9 huggingface dataset + # allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # General datasets: + ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # Math datasets: + ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # Coding datasets: + ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # IF datasets: + ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # Safety datasets: + ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + + # Specialty datasets: + ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + +max_seq_length: 4096 # need to increase to 8k +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 2 # effective batch size 128 with 4 nodes +learning_rate: 5.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git 
a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml index 93dc19d98..84a523f4e 100644 --- a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml @@ -5,43 +5,43 @@ tokenizer_name: meta-llama/Llama-3.1-8B use_slow_tokenizer: true dataset_mixer: # Static v3.9 huggingface dataset - allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + # allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 - # # General datasets: - # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all - # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all - # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all - # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all - # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + # General datasets: + ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 - # # Math datasets: - # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all - # allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all - # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 - # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 - # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + # Math datasets: + ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 - # # Coding datasets: - # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all - # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + # Coding datasets: + ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all - # # IF datasets: - # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + # IF datasets: + ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all - # # Safety datasets: - # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all - # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 - # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + # Safety datasets: + ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 - # # Specialty datasets: - # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 - # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 - # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + # Specialty datasets: + ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 max_seq_length: 4096 preprocessing_num_workers: 128 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 2 # effective batch size 128 with 8 nodes +gradient_accumulation_steps: 8 # effective batch size 128 with 8 nodes learning_rate: 5.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 @@ -54,4 +54,4 @@ report_to: logging_steps: 1 checkpointing_steps: 1000 keep_last_n_checkpoints: 1 -dataset_mix_dir: /output/ \ No newline at 
end of file +dataset_mix_dir: /output/ diff --git a/downsampling.pdf b/downsampling.pdf new file mode 100644 index 000000000..8002b7a48 Binary files /dev/null and b/downsampling.pdf differ diff --git a/downsampling_bars.pdf b/downsampling_bars.pdf new file mode 100644 index 000000000..972ef4112 Binary files /dev/null and b/downsampling_bars.pdf differ diff --git a/oe-eval-internal b/oe-eval-internal new file mode 160000 index 000000000..4c104ac6b --- /dev/null +++ b/oe-eval-internal @@ -0,0 +1 @@ +Subproject commit 4c104ac6b4fd05d1d0f83d3d2e6a46eb77efc592 diff --git a/scripts/eval/oe-eval.sh b/scripts/eval/oe-eval.sh index f66d250eb..5f74d76c8 100755 --- a/scripts/eval/oe-eval.sh +++ b/scripts/eval/oe-eval.sh @@ -151,5 +151,5 @@ for TASK in "${TASKS[@]}"; do GPU_COUNT=$GPU_COUNT fi - python oe-eval-internal/oe_eval/launch.py --model "$MODEL_NAME" --beaker-workspace "ai2/tulu-3-results" --beaker-budget ai2/oe-adapt --task "$TASK" $MODEL_TYPE --batch-size "$BATCH_SIZE" --model-args "{\"model_path\":\"${MODEL_LOCATION}\", \"max_length\": ${MAX_LENGTH}}" ${HF_UPLOAD_ARG} --gpus "$GPU_COUNT" --gantry-args '{"env-secret": "OPENAI_API_KEY=openai_api_key"}' ${REVISION_ARG} --beaker-retries 2 --beaker-priority "$PRIORITY" + python oe-eval-internal/oe_eval/launch.py --model "$MODEL_NAME" --beaker-workspace "ai2/tulu-3-dev" --beaker-budget ai2/oe-adapt --task "$TASK" $MODEL_TYPE --batch-size "$BATCH_SIZE" --model-args "{\"model_path\":\"${MODEL_LOCATION}\", \"max_length\": ${MAX_LENGTH}}" ${HF_UPLOAD_ARG} --gpus "$GPU_COUNT" --gantry-args '{"env-secret": "OPENAI_API_KEY=jacobm_OPENAI_API_KEY", "weka": "oe-adapt-default:/oe-adapt-default"}' ${REVISION_ARG} --beaker-retries 2 --beaker-priority "urgent" --gantry-secret-hf-write "jacobm_HF_TOKEN" done diff --git a/scripts/filter-v3.8-data.py b/scripts/filter-v3.8-data.py new file mode 100644 index 000000000..0d81a3881 --- /dev/null +++ b/scripts/filter-v3.8-data.py @@ -0,0 +1,111 @@ +from datasets import load_dataset + +full_ds = load_dataset("allenai/tulu-v.3.8-mix-preview-noncommercial") + +conversations = set() +prompts = set() + +for elem in full_ds["train"]: + conv = "" + prompt = elem["messages"][0]["content"] + prompts.add(prompt) + for msg in elem["messages"]: + conv += msg["content"] + conversations.add(conv) + + ### Not using anymore: + # ai2-adapt-dev/wildchat_gpt4_converted: 100000 + # # ai2-adapt-dev/tulu_v3.8_unused_wildchat_prompts + # # ai2-adapt-dev/tulu_v3.8_unused_wildchat_conversations + +seed = 42 + +### splitting: + +# wildchat_gpt4_converted_safety_decontaminated: 100000 +wildchat_ds = load_dataset("ai2-adapt-dev/wildchat_gpt4_converted_safety_decontaminated").shuffle(seed) +wildchat_ds_to_use = wildchat_ds["train"].select(range(100000)) +wildchat_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_wildchat_100k") +wildchat_ds_to_not_use = wildchat_ds["train"].select(range(100000, len(wildchat_ds["train"]))) +wildchat_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_wildchat_unused") + +del wildchat_ds +del wildchat_ds_to_use +del wildchat_ds_to_not_use + +# ai2-adapt-dev/open_math_2_gsm8k_converted: 50000 +openmath2_gsm8k_ds = load_dataset("ai2-adapt-dev/open_math_2_gsm8k_converted").shuffle(seed) +openmath2_gsm8k_to_use = openmath2_gsm8k_ds["train"].select(range(50000)) +openmath2_gsm8k_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k") +openmath2_gsm8k_to_not_use = openmath2_gsm8k_ds["train"].select(range(50000, len(openmath2_gsm8k_ds["train"]))) 
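# Illustrative sketch (not part of the original script; the helper name is hypothetical and it
# is never called here): each dataset split in this script follows the same recipe of shuffling
# with a fixed seed, pushing the first N examples as the downsampled split, and pushing the
# remainder as an "_unused" split.
def split_and_push(repo_id, n, used_repo, unused_repo, seed=42):
    # load the dataset, shuffle deterministically, and take the train split
    train = load_dataset(repo_id).shuffle(seed)["train"]
    # the first n shuffled examples become the downsampled split
    train.select(range(n)).push_to_hub(used_repo)
    # everything after the first n examples is kept as the unused split
    train.select(range(n, len(train))).push_to_hub(unused_repo)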
+openmath2_gsm8k_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_unused") + +del openmath2_gsm8k_ds +del openmath2_gsm8k_to_use +del openmath2_gsm8k_to_not_use + +# ai2-adapt-dev/personahub_math_interm_algebra_50000: 20000 +p_math_alg_ds = load_dataset("ai2-adapt-dev/personahub_math_interm_algebra_50000").shuffle(seed) +p_math_alg_ds_to_use = p_math_alg_ds["train"].select(range(20000)) +p_math_alg_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k") +p_math_alg_ds_to_not_use = p_math_alg_ds["train"].select(range(20000, len(p_math_alg_ds["train"]))) +p_math_alg_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_unused") + +del p_math_alg_ds +del p_math_alg_ds_to_use +del p_math_alg_ds_to_not_use + +# ai2-adapt-dev/processed_wildjailbreak_safety_decontaminated: 50000 +wjb_ds = load_dataset("ai2-adapt-dev/processed_wildjailbreak_safety_decontaminated").shuffle(seed) +wjb_ds_to_use = wjb_ds["train"].select(range(50000)) +wjb_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k") +wjb_ds_to_not_use = wjb_ds["train"].select(range(50000, len(wjb_ds["train"]))) +wjb_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_unused") + +del wjb_ds +del wjb_ds_to_use +del wjb_ds_to_not_use + +# ai2-adapt-dev/synthetic_finalresp_wildguardmixtrain_safety_decontaminated: 50000 +wg_ds = load_dataset("ai2-adapt-dev/synthetic_finalresp_wildguardmixtrain_safety_decontaminated").shuffle(seed) +wg_ds_to_use = wg_ds["train"].select(range(50000)) +wg_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k") +wg_ds_to_not_use = wg_ds["train"].select(range(50000, len(wg_ds["train"]))) +wg_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_unused") + +del wg_ds +del wg_ds_to_use +del wg_ds_to_not_use + +# ai2-adapt-dev/sciriff_converted: 10000 +sciriff_ds = load_dataset("ai2-adapt-dev/sciriff_converted").shuffle(seed) +sciriff_ds_to_use = sciriff_ds["train"].select(range(10000)) +sciriff_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_sciriff_10k") +sciriff_ds_to_not_use = sciriff_ds["train"].select(range(10000, len(sciriff_ds["train"]))) +sciriff_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_sciriff_unused") + +del sciriff_ds +del sciriff_ds_to_use +del sciriff_ds_to_not_use + +# ai2-adapt-dev/table_gpt_converted: 5000 +table_gpt_ds = load_dataset("ai2-adapt-dev/table_gpt_converted").shuffle(seed) +table_gpt_ds_to_use = table_gpt_ds["train"].select(range(5000)) +table_gpt_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_table_gpt_5k") +table_gpt_ds_to_not_use = table_gpt_ds["train"].select(range(5000, len(table_gpt_ds["train"]))) +table_gpt_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_table_gpt_unused") + +del table_gpt_ds +del table_gpt_ds_to_use +del table_gpt_ds_to_not_use + +# ai2-adapt-dev/aya_dataset_converted: 100000 +aya_ds = load_dataset("ai2-adapt-dev/aya_dataset_converted").shuffle(seed) +aya_ds_to_use = aya_ds["train"].select(range(100000)) +aya_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_aya_100k") +aya_ds_to_not_use = aya_ds["train"].select(range(100000, len(aya_ds["train"]))) +aya_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_aya_unused") + +del aya_ds +del aya_ds_to_use +del aya_ds_to_not_use diff --git a/scripts/plot-downsampling.py b/scripts/plot-downsampling.py new file mode 100644 index 000000000..bc7b04b65 --- /dev/null +++ 
b/scripts/plot-downsampling.py @@ -0,0 +1,309 @@ +benchmark_data = { + "Avg.": { + "eval_setting": "", + "sft_5": 57.69, + "sft_10": 58.06, + "sft_25": 58.64, + "sft_50": 59.18, + "sft_75": 59.57, + "sft_full": 60.08 + }, + "MMLU": { + "eval_setting": "5 shot", + "sft_5": 64.1, + "sft_10": 63.9, + "sft_25": 63.4, + "sft_50": 62.3, + "sft_75": 62.1, + "sft_full": 62.1 + }, + "TruthfulQA": { + "eval_setting": "6 shot", + "sft_5": 51.0, + "sft_10": 50.4, + "sft_25": 49.9, + "sft_50": 48.9, + "sft_75": 46.4, + "sft_full": 46.8 + }, + "PopQA": { + "eval_setting": "15 shot", + "sft_5": 30.8, + "sft_10": 30.8, + "sft_25": 29.8, + "sft_50": 30.1, + "sft_75": 29.6, + "sft_full": 29.3 + }, + # TODO: BBH IS NOT UP TO DATE!!! + "BigBenchHard": { + "eval_setting": "3 shot, CoT", + "sft_5": 67.5, + "sft_10": 68.2, + "sft_25": 68.5, + "sft_50": 67.6, + "sft_75": 69.7, + "sft_full": 68.8 + }, + "HumanEval": { + "eval_setting": "pass@10", + "sft_5": 81.5, + "sft_10": 81.5, + "sft_25": 81.4, + "sft_50": 84.4, + "sft_75": 86.7, + "sft_full": 86.2 + }, + "HumanEval+": { + "eval_setting": "pass@10", + "sft_5": 76.1, + "sft_10": 77.4, + "sft_25": 75.5, + "sft_50": 78.3, + "sft_75": 79.5, + "sft_full": 81.4 + }, + "GSM8K": { + "eval_setting": "8 shot, CoT", + "sft_5": 66.0, + "sft_10": 66.3, + "sft_25": 72.1, + "sft_50": 73.8, + "sft_75": 74.4, + "sft_full": 76.2 + }, + "DROP": { + "eval_setting": "3 shot", + "sft_5": 60.7, + "sft_10": 60.7, + "sft_25": 59.4, + "sft_50": 60.7, + "sft_75": 59.9, + "sft_full": 61.3 + }, + "MATH": { + "eval_setting": "4 shot CoT, Flex", + "sft_5": 29.3, + "sft_10": 28.7, + "sft_25": 30.0, + "sft_50": 30.9, + "sft_75": 31.7, + "sft_full": 31.5 + }, + "IFEval": { + "eval_setting": "Strict", + "sft_5": 65.4, + "sft_10": 68.6, + "sft_25": 70.6, + "sft_50": 68.2, + "sft_75": 70.6, + "sft_full": 72.8 + }, + "AlpacaEval 2": { + "eval_setting": "LC % win", + "sft_5": 11.1, + "sft_10": 10.2, + "sft_25": 11.7, + "sft_50": 13.3, + "sft_75": 12.4, + "sft_full": 12.4 + }, + "Safety": { + "eval_setting": "", + "sft_5": 89.8, + "sft_10": 90.9, + "sft_25": 92.3, + "sft_50": 92.6, + "sft_75": 92.8, + "sft_full": 93.1 + } +} + +import matplotlib.pyplot as plt +import numpy as np + +# Create x-axis values (SFT percentages) +x_values = [5, 10, 25, 50, 75, 100] # 100 represents full SFT + +# # Create figure and axis with a larger size +# plt.figure(figsize=(12, 8)) + +# # Color palette for different lines +# colors = plt.cm.tab20(np.linspace(0, 1, len(benchmark_data))) + +# # Plot each benchmark +# for (benchmark, data), color in zip(benchmark_data.items(), colors): +# if benchmark != "Avg.": # Skip the average for now +# y_values = [ +# data["sft_5"], +# data["sft_10"], +# data["sft_25"], +# data["sft_50"], +# data["sft_75"], +# data["sft_full"] +# ] +# plt.plot(x_values, y_values, marker='o', label=benchmark, color=color, linewidth=2) + +# # Add the average line with higher emphasis +# avg_values = [ +# benchmark_data["Avg."]["sft_5"], +# benchmark_data["Avg."]["sft_10"], +# benchmark_data["Avg."]["sft_25"], +# benchmark_data["Avg."]["sft_50"], +# benchmark_data["Avg."]["sft_75"], +# benchmark_data["Avg."]["sft_full"] +# ] +# plt.plot(x_values, avg_values, 'k--', label='Average', linewidth=3, marker='s') + +# # Customize the plot +# plt.xlabel('SFT Training Data Size', fontsize=12) +# plt.ylabel('Performance', fontsize=12) +# plt.title('Benchmark Performance Across Different SFT Percentages', fontsize=14) +# plt.grid(True, linestyle='--', alpha=0.7) +# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper 
left', fontsize=10) + +# # Set x-axis ticks +# plt.xticks(x_values) + +# # Adjust layout to prevent label cutoff +# plt.tight_layout() + +# # Show the plot +# plt.show() + +# Optional: Create a second plot focusing on specific benchmarks of interest +# plt.figure(figsize=(20, 8)) + +# Define benchmarks and SFT percentages +benchmarks = [ + 'Avg.', + 'GSM8K', + 'HumanEval+', + 'Safety', + 'TruthfulQA', +] +sft_percentages = ['5%', '10%', '25%', '50%', '75%', '100%'] +# colors = ['#0A2B35', '#0fcb8c', '#105257', '#f0529c', '#838383', '#0a3235'] # One color for each percentage +colors = [ + '#FAC4DD', # 10% + '#F8ADD0', # 20% + '#F697C3', # 40% + '#F480B6', # 60% + '#F269A9', # 80% + '#F0529C', # 100% - original pink +] + +colors = [ + "#E7EEEE", # RGB(231, 238, 238) + "#CEDCDD", # RGB(206, 220, 221) + "#B7CBCC", # RGB(183, 203, 204) + "#9FB9BB", # RGB(159, 185, 187) + "#88A8AB", # RGB(136, 168, 171) + "#F0529C", # PINK + "#6E979A", # RGB(110, 151, 154) + "#588689", # RGB(88, 134, 137) + "#3F7478", # RGB(63, 116, 120) + "#105257", # RGB(16, 82, 87) + "#0A3235", # RGB(10, 50, 53) +] + +# Set up the plot +fig, ax = plt.subplots(figsize=(20, 8)) + +# Width of bars and positions +width = 0.12 +n_percentages = len(sft_percentages) + +# Create bars for each benchmark +for i, benchmark in enumerate(benchmarks): + data = benchmark_data[benchmark] + values = [ + data["sft_5"], + data["sft_10"], + data["sft_25"], + data["sft_50"], + data["sft_75"], + data["sft_full"] + ] + + # Calculate positions for this benchmark's group of bars + x = i + for j in range(n_percentages): + bar_position = x - (n_percentages-1)*width/2 + j*width + bar = ax.bar(bar_position, values[j], width, + label=sft_percentages[j] if i == 0 else "", + color=colors[j], + edgecolor="black") + + # Add value labels on top of bars + # ax.text(bar_position, values[j], f'{values[j]:.1f}', ha='center', va='bottom', fontsize=8) + +# Customize the plot +# ax.set_xlabel('Benchmarks', fontsize=14) +ax.set_ylabel('Performance', fontsize=18) +plt.tick_params(axis='y', labelsize=18) +# ax.set_title('Performance by Benchmark and SFT Percentage', fontsize=14) + +# Set x-axis ticks and labels +ax.set_xticks(range(len(benchmarks))) +ax.set_xticklabels(benchmarks, ha="center", fontsize=18) + +ax.spines[["right", "top"]].set_visible(False) + +# Add legend +# ax.legend(title='SFT Sample Size', loc='center', bbox_to_anchor=(0.885, 0.8)) + +# Add grid +# ax.grid(True, linestyle='--', alpha=0.3, axis='y') + +# Adjust layout to accommodate legend +# plt.subplots_adjust(right=0.85) + +# Save and show the plot +plt.savefig('downsampling_bars.pdf', bbox_inches='tight', dpi=300) +plt.show() + +# # Define specific benchmarks and their colors +# plot_config = { +# 'Avg.': '#0a3235', # Black for average +# 'TruthfulQA': '#b11bE8', # Coral red +# 'HumanEval+': '#f0529c', # Turquoise +# 'Safety': '#105257', # Light blue +# 'GSM8K': '#0fcb8c' # Sage green +# } + +# # Plot each benchmark with its specified color +# for benchmark, color in plot_config.items(): +# data = benchmark_data[benchmark] +# y_values = [ +# data["sft_5"], +# data["sft_10"], +# data["sft_25"], +# data["sft_50"], +# data["sft_75"], +# data["sft_full"] +# ] +# # Make average line dashed and thicker +# if benchmark == 'Avg.': +# plt.plot(x_values, y_values, '--', marker='s', label=benchmark, +# color=color, linewidth=3) +# else: +# plt.plot(x_values, y_values, marker='o', label=benchmark, +# color=color, linewidth=2) + +# # Customize the focused plot +# plt.xlabel('SFT Percentage', fontsize=12) +# 
plt.ylabel('Performance', fontsize=12) +# # plt.title('Selected Benchmark Performance Trends', fontsize=14) +# plt.grid(True, linestyle='--', alpha=0.7) +# plt.legend(fontsize=10) +# plt.xticks(x_values) + +# # Adjust layout +# plt.tight_layout() + +# # Show the plot +# # plt.show() + +# plt.savefig('downsampling.pdf', bbox_inches='tight', dpi=300) +# plt.close() \ No newline at end of file diff --git a/scripts/plot-versions-sft.py b/scripts/plot-versions-sft.py new file mode 100644 index 000000000..7947a1946 --- /dev/null +++ b/scripts/plot-versions-sft.py @@ -0,0 +1,177 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +# Create dictionary with all models +data = { + # "L3.1-8B-v3.9-nc-fixed-2-pif_uf_hs_dpo___model__42__1730613882": { + # "Rank": 1, "Average": 62.42, "alpaca_eval": 28.6, "BBH": 68.9, + # "codex_humaneval": 85.1, "codex_humanevalplus": 81.4, "drop": 61.3, + # "GSM8K": 82.3, "IFEval": 78.4, "MATH": 41.2, + # "mmlu:cot::summarize": float('nan'), "MMLU": 63.1, + # "Safety": 76.5, "popqa": 29.1, "truthfulqa": 54.9 + # }, + # "fae_dpo_on_L3.1-8B-v3.9-nc-fixed-2_add_shp___model__42__1730847906": { + # "Rank": 2, "Average": 61.01, "alpaca_eval": 27.1, "BBH": 65.0, + # "codex_humaneval": 83.9, "codex_humanevalplus": 78.2, "drop": 58.0, + # "GSM8K": 82.7, "IFEval": 78.6, "MATH": 41.9, + # "mmlu:cot::summarize": float('nan'), "MMLU": 64.8, + # "Safety": 76.1, "popqa": 29.1, "truthfulqa": 48.5 + # }, + "Tülu v3.7": { + "Rank": 3, "Average": 60.48, "alpaca_eval": 13.7, "BBH": 67.8, + "codex_humaneval": 87.2, "codex_humanevalplus": 83.6, "drop": 60.6, + "GSM8K": 75.1, "IFEval": 72.5, "MATH": 32.6, + "mmlu:cot::summarize": 65.1, "MMLU": 63.8, + "Safety": 94.7, "popqa": 29.4, "truthfulqa": 44.7 + }, + "Tülu v3.8": { + "Rank": 4, "Average": 60.12, "alpaca_eval": 12.0, "BBH": 67.9, + "codex_humaneval": 85.8, "codex_humanevalplus": 81.1, "drop": 60.4, + "GSM8K": 77.2, "IFEval": 72.1, "MATH": 32.5, + "mmlu:cot::summarize": 65.3, "MMLU": 63.2, + "Safety": 93.5, "popqa": 29.3, "truthfulqa": 46.5 + }, + "Tülu v3.9": { + "Rank": 5, "Average": 60.08, "alpaca_eval": 12.4, "BBH": 67.9, + "codex_humaneval": 86.2, "codex_humanevalplus": 81.4, "drop": 61.3, + "GSM8K": 76.2, "IFEval": 72.8, "MATH": 31.5, + "mmlu:cot::summarize": float('nan'), "MMLU": 62.1, + "Safety": 93.1, "popqa": 29.3, "truthfulqa": 46.8 + }, + "Tülu v3.4": { + "Rank": 6, "Average": 56.79, "alpaca_eval": 11.4, "BBH": 65.3, + "codex_humaneval": 86.2, "codex_humanevalplus": 78.3, "drop": 55.8, + "GSM8K": 76.3, "IFEval": 52.9, "MATH": 25.5, + "mmlu:cot::summarize": 62.0, "MMLU": 64.8, + "Safety": 89.6, "popqa": 23.5, "truthfulqa": 51.9 + }, + "Tülu v3.1": { + "Rank": 7, "Average": 55.46, "alpaca_eval": 10.5, "BBH": 64.6, + "codex_humaneval": 83.8, "codex_humanevalplus": 80.8, "drop": 64.7, + "GSM8K": 74.5, "IFEval": 52.5, "MATH": 19.5, + "mmlu:cot::summarize": 63.7, "MMLU": 64.6, + "Safety": 70.3, "popqa": 31.4, "truthfulqa": 48.3 + }, + "Tülu v3.0": { + "Rank": 8, "Average": 55.18, "alpaca_eval": 11.3, "BBH": 63.3, + "codex_humaneval": 85.4, "codex_humanevalplus": 81.2, "drop": 62.5, + "GSM8K": 72.9, "IFEval": 48.8, "MATH": 24.2, + "mmlu:cot::summarize": 62.8, "MMLU": 65.1, + "Safety": 68.0, "popqa": 31.2, "truthfulqa": 48.2 + }, + # "Tülu v3.2": { + # "Rank": 9, "Average": 55.05, "alpaca_eval": 12.1, "BBH": 66.5, + # "codex_humaneval": 84.2, "codex_humanevalplus": 79.7, "drop": 63.1, + # "GSM8K": 73.1, "IFEval": 49.7, "MATH": 19.0, + # "mmlu:cot::summarize": 63.7, "MMLU": 64.1, + # "Safety": 68.9, 
"popqa": 31.6, "truthfulqa": 49.2 + # }, + # "hf-llama-3-tulu-2-dpo-8b": { + # "Rank": 10, "Average": 49.49, "alpaca_eval": 14.1, "BBH": 57.3, + # "codex_humaneval": 69.2, "codex_humanevalplus": 67.7, "drop": 58.3, + # "GSM8K": 63.6, "IFEval": 48.8, "MATH": 13.5, + # "mmlu:cot::summarize": float('nan'), "MMLU": 61.8, + # "Safety": 57.9, "popqa": 24.6, "truthfulqa": 59.8 + # }, + "Tülu v2.0": { + "Rank": 11, "Average": 48.30, "alpaca_eval": 8.9, "BBH": 57.1, + "codex_humaneval": 66.9, "codex_humanevalplus": 63.1, "drop": 61.7, + "GSM8K": 60.4, "IFEval": 42.3, "MATH": 14.0, + "mmlu:cot::summarize": float('nan'), "MMLU": 61.8, + "Safety": 70.7, "popqa": 23.3, "truthfulqa": 49.4 + } +} + +# Replace this dictionary with your preferred hex colors for each model +colors = { + "Tülu v2.0": "#F7C8E2", + "Tülu v3.0": "#E7EEEE", # RGB(231, 238, 238) + "Tülu v3.1": "#CEDCDD", # RGB(206, 220, 221) + "Tülu v3.4": "#9FB9BB", + "Tülu v3.7": "#88A8AB", + # "Tülu v3.2": "#000080", + "Tülu v3.8": "#6E979A", + # "Tülu v3.7": "#588689", + # "Tülu v3.8": "#3F7478", + "Tülu v3.9": "#F0529C", + "fae_dpo_on_L3.1-8B-v3.9-nc-fixed-2_add_shp___model__42__1730847906": "#00FF00", + "L3.1-8B-v3.9-nc-fixed-2-pif_uf_hs_dpo___model__42__1730613882": "#FF0000", + "hf-llama-3-tulu-2-dpo-8b": "#808000", +} + + # "#B7CBCC", # RGB(183, 203, 204) + # "#9FB9BB", # RGB(159, 185, 187) + # "#88A8AB", # RGB(136, 168, 171) + # "#6E979A", # RGB(110, 151, 154) + # "#588689", # RGB(88, 134, 137) + # "#3F7478", # RGB( + # ) + # "#105257", # RGB(16, 82, 87) + # "#0A3235", # RGB(10, 50, 53) + # "#F0529C", # PINK + +# Convert dictionary to DataFrame +df = pd.DataFrame.from_dict(data, orient='index') + +# Get metrics (excluding Rank and Average) +# metrics = [col for col in df.columns if col not in ['Rank']] + +metrics = [ + "Average", + "BBH", + "GSM8K", + "IFEval", + "MATH", + "MMLU", + "Safety", +] + +# Set up the plot +fig, ax = plt.subplots(figsize=(15, 8)) + +# Create the grouped bar chart +# plt.figure(figsize=(20, 10)) + +# Set the width of each bar and positions of the bars +width = 0.08 # Reduced width to accommodate more bars +x = np.arange(len(metrics)) + +# Create bars for each model +for i, (model, model_data) in enumerate(sorted(df.iterrows())): + plt.bar(x + i*width, + model_data[metrics], + width, + label=model.split('___')[0] if '___' in model else model, + color=colors[model], + edgecolor="black") + +# Customize the plot +# plt.xlabel('Metrics', fontsize=12) +# plt.ylabel('Score', fontsize=12) +# plt.title('Model Performance Comparison Across Different Metrics', fontsize=14) + +# Customize the plot +# ax.set_xlabel('Benchmarks', fontsize=14) +ax.set_ylabel('Performance', fontsize=18) +plt.tick_params(axis='y', labelsize=18) +# ax.set_title('Performance by Benchmark and SFT Percentage', fontsize=14) + +# Set x-axis ticks and labels +ax.set_xticks(range(len(metrics))) +ax.set_xticklabels(metrics, ha="center", fontsize=18) + +ax.spines[["right", "top"]].set_visible(False) + +# Add legend + +plt.xticks(x + width * len(df)/2, metrics, ha='center') +# plt.legend(bbox_to_anchor=(0.6, 0.75), loc='upper left') +# plt.grid(True, alpha=0.3) + +# Adjust layout to prevent label cutoff +plt.tight_layout() + +# Save and show the plot +plt.savefig('tulu_version_bars.pdf', bbox_inches='tight', dpi=300) +plt.show() \ No newline at end of file diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index a0f06da44..f55f4a1c3 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -76,11 +76,11 @@ def 
adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): parser.add_argument("--beaker_image", type=str, default="nathanl/open_instruct_auto", help="If given, use this Beaker image.") parser.add_argument("--beaker_subfolder", type=str, default=None) parser.add_argument("--cluster", nargs='+', default=[ - "ai2/allennlp-cirrascale", - "ai2/general-cirrascale", - "ai2/s2-cirrascale-l40", + # "ai2/allennlp-cirrascale", + # "ai2/general-cirrascale", + # "ai2/s2-cirrascale-l40", "ai2/allennlp-elara-cirrascale", - "ai2/pluto-cirrascale", + # "ai2/pluto-cirrascale", "ai2/neptune-cirrascale", "ai2/saturn-cirrascale", "ai2/jupiter-cirrascale-2", @@ -470,9 +470,11 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", f"--model_name_or_path {model_info[1]} --hf_revision {args.hf_revision}")] task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", f"--tokenizer_name_or_path {model_info[1]}")] elif model_info[1].startswith("/"): # if it's a local model, load it from the local directory - assert nfs_available, "NFS is required for path-based models." # to be safe. + # assert nfs_available, "NFS is required for path-based models." # to be safe. task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", f"--model_name_or_path {model_info[1]}")] task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", f"--tokenizer_name_or_path {model_info[1]}")] + elif model_info[1].startswith("weka"): + task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1].split(':/')[1])] else: # if it's a beaker model, mount the beaker dataset to `/model` task_spec['datasets'][1]['source']['beaker'] = model_info[1] @@ -590,6 +592,8 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): ## model location munging: if beaker, use beaker://. If hf, just name if model_info[0].startswith("hf-"): oe_eval_cmd += f" --model-location {model_info[1]}" + elif "weka" in model_info[1]: + oe_eval_cmd += f" --model-location {model_info[1]}" else: oe_eval_cmd += f" --model-location beaker://{model_info[1]}" if args.hf_revision: @@ -639,6 +643,8 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): assert nfs_available, "NFS is required for path-based models." # to be safe. task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1])] task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", "--tokenizer_name_or_path "+model_info[1])] + elif model_info[1].startswith("weka"): + task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1].split(':/')[1])] else: # if it's a beaker model, mount the beaker dataset to `/model` task_spec['datasets'][1]['source']['beaker'] = model_info[1] @@ -651,6 +657,7 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): # add gpu information. 
# we just assume you want to use all the gpus for one task at a time + task_spec['resources']['gpuCount'] = 8 num_gpus = task_spec['resources']['gpuCount'] task_spec["arguments"][0]+= f" --min_gpus_per_task {num_gpus}" diff --git a/scripts/submit_finetune_job.py b/scripts/submit_finetune_job.py index 7b7e7609f..9ab0330de 100644 --- a/scripts/submit_finetune_job.py +++ b/scripts/submit_finetune_job.py @@ -166,7 +166,7 @@ def parse_args(args): d['tasks'][0]['arguments'][0] = new_arguments # name and description - exp_name = f"open_instruct_finetune_{model_name}_{now}" + exp_name = f"open_instruct_finetune_{model_name}_{now}"[:128] d['description'] = exp_name d['tasks'][0]['name'] = exp_name @@ -199,6 +199,9 @@ def parse_args(args): }, ] + if "google" in args.cluster: + d["tasks"][0]["datasets"].pop(0) + # WANDB settings for env in d['tasks'][0]['envVars']: if env['name'] == "WANDB_DISABLED": diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py new file mode 100644 index 000000000..061dc03f8 --- /dev/null +++ b/scripts/submit_merge_job.py @@ -0,0 +1,124 @@ +import copy +import subprocess +import yaml +import re +import itertools +from datetime import date +import argparse +import os + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--workspace", type=str, default="tulu-3-dev") + parser.add_argument("--beaker_image", type=str, default="nathanl/open_instruct_auto", help="If given, use this Beaker image.") + parser.add_argument("--beaker_config", type=str, default="configs/beaker_configs/default_merge.yaml") + parser.add_argument("--merge_config", type=str, default="configs/merge_configs/example_linear_merge_config.yaml") + parser.add_argument("--cluster", nargs='+', default=["ai2/neptune-cirrascale", "ai2/saturn-cirrascale", "ai2/jupiter-cirrascale-2"]) + parser.add_argument("--priority", type=str, default="high") + parser.add_argument("--preemptible", action="store_true", default=True, help="for using preemtipble jobs (required on some instances)") + parser.add_argument("--output_dir", type=str, default="/output") + args = parser.parse_args() + + with open(args.merge_config, 'r') as f: + default_yaml = f.read() + mergeConfig = yaml.load(default_yaml, Loader=yaml.FullLoader) + + # TODO: support SLERP + assert mergeConfig["merge_method"] in ["linear", "task_arithmetic"], f"merging method {mergeConfig['merge_method']} not supported" + + with open(f"configs/merge_configs/base_configs/default_{mergeConfig['merge_method']}_merge.yaml", 'r') as f: + merge_yaml = f.read() + baseConfig = yaml.load(merge_yaml, Loader=yaml.FullLoader) + + baseConfig["normalize"] = mergeConfig["normalize"] + baseConfig["models"] = [] + + if mergeConfig["merge_method"] == "task_arithmetic": + baseConfig["models"].append({ + "model": mergeConfig["base_model"] + }) + baseConfig["base_model"] = mergeConfig["base_model"] + + beakerDatasets = [] + wekaBuckets = set() + for elem in mergeConfig["models"]: + # - model: /model-one + # parameters: + # weight: 1.0 + + # - name: name + # location: beaker + # path: jacobm/beaker-dataset + # weight: 0.5 + if elem["location"] == "beaker": + model_data = { + "model": f"/{elem['name']}", + "parameters": {"weight": float(elem["weight"])} + } + if mergeConfig["merge_method"] == "task_arithmetic": + model_data["parameters"]["normalize"] = mergeConfig["normalize"] + # beakerConfig['datasets'][1]['source']['beaker'] = model_info[1] + # - mountPath: /hf_llama_models + # source: + # beaker: Yizhongw03/hf_llama_model_7B + beakerDatasets.append({ + 
"mountPath": f"/{elem['name']}", + "source": {"beaker": elem["path"]} + }) + # mount datasets + elif elem["location"] in ["huggingface", "nfs"]: + model_data = { + "model": elem['path'], + "parameters": {"weight": float(elem["weight"])} + } + if mergeConfig["merge_method"] == "task_arithmetic": + model_data["parameters"]["normalize"] = mergeConfig["normalize"] + elif elem["location"] == "weka": # verify the only available cluster(s) have weka + if elem["wekaBucket"] not in wekaBuckets: + beakerDatasets.append({ + "mountPath": f"/{elem['wekaBucket']}", + "source": {"weka": elem["wekaBucket"]} + }) + wekaBuckets.add(elem["wekaBucket"]) + model_data = { + "model": elem["path"], + "parameters": {"weight": float(elem["weight"])} + } + if mergeConfig["merge_method"] == "task_arithmetic": + model_data["parameters"]["normalize"] = mergeConfig["normalize"] + else: + print(f"Unsupported location: {elem['location']}") + baseConfig["models"].append(model_data) + + with open(args.beaker_config, 'r') as f: + beaker_yaml = f.read() + beakerConfig = yaml.load(beaker_yaml, Loader=yaml.FullLoader) + + beakerConfig['tasks'][0]['image']['beaker'] = args.beaker_image + # TODO: fix these + beakerConfig['tasks'][0]['constraints']['cluster'] = args.cluster + beakerConfig['tasks'][0]['context']['priority'] = args.priority + beakerConfig['tasks'][0]['context']['preemptible'] = args.preemptible # True required for Jupiter/Pluto + + print(beakerConfig) + + if len(beakerDatasets) > 0: + beakerConfig["tasks"][0]["datasets"] = beakerDatasets + base_command = beakerConfig["tasks"][0]["arguments"][0].replace("{OUTPUT_DIR}", args.output_dir) + beakerConfig["tasks"][0]["arguments"][0] = base_command.replace("{RAW_CONFIG}", f'"{str(baseConfig)}"') + + experiment_name = f"open_instruct_merge_models" + beakerConfig["description"] = experiment_name + # if configs/beaker_configs/auto_created doesn't exist, create it with os + if not os.path.exists("configs/beaker_configs/auto_created"): + os.makedirs("configs/beaker_configs/auto_created") + fn = "configs/beaker_configs/auto_created/{}.yaml".format(experiment_name) + os.makedirs(os.path.dirname(fn), exist_ok=True) + with open(fn, "w") as file: + yaml.dump(beakerConfig, file, default_flow_style=True) + + cmd = "beaker experiment create {} --workspace ai2/{}".format(fn, args.workspace) + subprocess.Popen(cmd, shell=True) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/table-script.py b/scripts/table-script.py new file mode 100644 index 000000000..ff199b035 --- /dev/null +++ b/scripts/table-script.py @@ -0,0 +1,331 @@ +import pandas as pd +import argparse +import sys + +""" +Examples: + +python -m plots.table-tulu3-90 --csv-path leaderboard/exported_results.csv --models ppo_ray_β_0.03__3__1730357435 Meta-Llama-3.1-8B-Instruct hf-ministral_8b_instruct_2410 hf-qwen2_5_7b_instruct valpy_dpo_70b_hslj_uflj_dalj_wciflj_iftaxlj_wcunusedlj hf-Llama-3.1-70B-Instruct hf-qwen2_5_72b_instruct + +8B +python -m plots.table-tulu3-90 --csv-path leaderboard/exported_results.csv --models Meta-Llama-3.1-8B-Instruct hf-google_gemma-2-9b-it hf-NousResearch-Hermes-3-Llama-3.1-8B hf-qwen2_5_7b_instruct hf-ministral_8b_instruct_2410 L3.18B-math-mix-final-nc__meta-llama_Llama-3.1-8B__42__1729284525 dpo_tune___model__42__1729311739 ppo_ray_β_0.03__3__1730357435 + +70B +python -m plots.table-tulu3-90 --csv-path leaderboard/exported_results.csv --models hf-Meta-Llama-3.1-70B-Instruct hf-qwen2_5_72b_instruct hf-NousResearch-Hermes-3-Llama-3.1-70B 
hf-llama_3_1_nemotron_70B_instruct_hf L3.1-70B-v3.8-lr_2e-6-2_epochs 70B_ppo_ray_β_0.07_lr_1e-7__3__1730258118 L3.1-70B-v3.8-lr_2e-6-2_epochs-pif_dpo-5e-7 + +Merging example: +python table-tulu3.py --csv-path ~/Downloads/exported_results_4.csv --models L3.1-8B-v3.8-nc-soup L3.1-8B-v3.9-nc-3__meta-llama_Llama-3.1-8B__456__1730332817 L3.1-8B-v3.9-nc-2__meta-llama_Llama-3.1-8B__123__1730333671 L3.1-8B-v3.9-nc__meta-llama_Llama-3.1-8B__42__1730330678 +""" + +model_label_conversion = { + # llamas + "Meta-Llama-3.1-8B-Instruct": "Llama 3.1 8B Instruct", + "hf-Llama-3.1-70B-Instruct": "Llama 3.1 70B Instruct", + "hf-Meta-Llama-3.1-70B-Instruct": "Llama 3.1 70B Instruct", + # + "hf-llama-3-tulu-2-8b": "Tulu 2 SFT", + "hf-llama-3-tulu-2-dpo-8b": "Tulu 2 + DPO", + "L3.1-8B-v3.8-nc-final__meta-llama_Llama-3.1-8B__42__1729991287": "Tulu 3 SFT", + "L3.1-8B-v3.8-wip-persona_code_v3-2-pif_dpo___model__42__1729725103": "Tulu 3 + DPO", + "ljrmvalue_lj_gsm_data_step_300": "Tulu 3 + RL", + "hf-NousResearch-Hermes-3-Llama-3.1-8B": "Hermes 3 8B", + "hf-NousResearch-Hermes-3-Llama-3.1-70B": "Hermes 3 70B", + "hf-llama_3_tulu_2_dpo_70b": "Tulu 2 + DPO 70B", + "L3.1-70B-v3.7-nc": "Tulu 3 70B SFT", + "hf-google_gemma-2-9b-it": "Gemma 2 9B", + "hf-ministral_8b_instruct_2410": "Ministral 8B", + "hf-magpielm_8b_chat_v0_1": "Magpie 8B", + "hf-gemma_2_9b_it_simpo": "Gemma 2 9B SimPO", + "L3.1-8B-v3.8-nc-soup-pif_dpo-soup": "Tulu 3 + Merging + DPO", + "L3.1-8B-v3.8-nc-soup": "Tulu 3 SFT Merge", + "L3.1-8B-v3.9-nc-3__meta-llama_Llama-3.1-8B__456__1730332817": "Seed 1", + "L3.1-8B-v3.9-nc-2__meta-llama_Llama-3.1-8B__123__1730333671": "Seed 2", + "L3.1-8B-v3.9-nc__meta-llama_Llama-3.1-8B__42__1730330678": "Seed 3", + # random SFT mixes + "fae_llama3_sftmix_v3.4_personahub_if_v1__meta-llama_Meta-Llama-3-8B__42__1728059424": "Tulu v3.4 SFT", + "sft_preview_mix_v3.5.10__meta-llama_Llama-3.1-8B__42__1729148912": "Tulu v3.6 SFT", + "L3.18B-v3.7-c__meta-llama_Llama-3.1-8B__42__1729454073": "Tulu v3.7 SFT", + "L3.1-8B-v3.8-nc-final__meta-llama_Llama-3.1-8B__42__1729991287": "Tulu v3.8 SFT", + "L3.1-8B-v3.8-nc-soup": "Tulu v3.8 SFT + Merging", + "hf-llama_3_tulu_2_70b": "Tulu 2 SFT 70B", + "L3.1-70B-v3.8-lr_2e-6-2_epochs-pif_dpo-5e-7": "Tulu 3 DPO 70B", + "L3.1-70B-v3.8-lr_2e-6-2_epochs": "Tulu 3 SFT 70B", + # 7b rivals + "hf-qwen2_5_7b_instruct": "Qwen 2.5 7B Instruct", + "hf-ministral_8b_instruct_2410": "Ministral 8B Instruct", + "hf-google_gemma-2-9b-it": "Gemma 2 9B", + "hf-gemma_2_9b_it_simpo": "Gemma 2 9B SimPO", + # 70b rivalsqw + "hf-llama_3_1_nemotron_70b_instruct_hf": "Nemotron Llama 3.1 70B", + "hf-llama_3_1_nemotron_70B_instruct_hf": "Nemotron Llama 3.1 70B", + "hf-qwen2_5_72b_instruct": "Qwen 2.5 72B", + # LMSYS version compare + "L3.18B-math-mix-final-nc__meta-llama_Llama-3.1-8B__42__1729284525": "Tulu 3 SFT", + "dpo_tune___model__42__1729311739": "Tulu 3 DPO", + "ppo_ray_β_0.03__3__1730357435": "Tulu 3 8B", + # 70b fine tunes + "L3.1-70B-v3.8-lr_2e-6-2_epochs-pif_dpo-5e-7": "Tulu 70B DPO", + "70B_ppo_ray_β_0.07_lr_1e-7__3__1730258118": "Tulu 70B RL", + "valpy_dpo_70b_hslj_uflj_dalj_wciflj_iftaxlj_wcunusedlj": "Tulu 3 70B", + "hf-NousResearch-Hermes-3-Llama-3.1-8B": "Hermes 3 8B", + "hf-llama-3-tulu-2-8b": "Tulu 2 8B SFT", + "L3.1-8B-v3.9-nc-fixed-2__meta-llama_Llama-3.1-8B__123__1730531285": "Tulu 3 8B SFT", + "hf-NousResearch-Hermes-3-Llama-3.1-70B": "Hermes 3 70B", + "hf-llama-3-tulu-2-70b": "Tulu 2 70B SFT", + "L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-3__meta-llama_Llama-3.1-70B__456__1731059165": "Tulu 3 70B SFT", + 
"L3.1-8B-v3.9-nc-no-safety__meta-llama_Llama-3.1-8B__42__1731562927": "Tulu 3 8B SFT w/o Safety", + "L3.1-8B-v3.9-nc-no-wc__meta-llama_Llama-3.1-8B__42__1731562946": "Tulu 3 8B SFT w/o WildChat", + "L3.1-8B-v3.9-nc-no-synthetic__meta-llama_Llama-3.1-8B__42__1731613382": "Tulu 3 8B SFT w/o Synthetic Data (ours)", + "L3.1-8B-v3.9-nc-no-math__meta-llama_Llama-3.1-8B__42__1731562937": "Tulu 3 8B SFT w/o Mathematics", + "hf-RLHFlow-LLaMA3-SFT-v2": "RLHFlow SFT V2", + "hf-MAmmoTH2-8B": "MAmmoTH2 8B", + + # downsampling + "L3.1-8B-v3.9-nc-downsample-0.05__meta-llama_Llama-3.1-8B__42__1731214637": "Tulu 3 8B SFT (5\%)", + "L3.1-8B-v3.9-nc-downsample-0.10__meta-llama_Llama-3.1-8B__42__1731214619": "Tulu 3 8B SFT (10\%)", + "L3.1-8B-v3.9-nc-downsample-0.25__meta-llama_Llama-3.1-8B__42__1731214572": "Tulu 3 8B SFT (25\%)", + "L3.1-8B-v3.9-nc-downsample-0.50__meta-llama_Llama-3.1-8B__42__1731214572": "Tulu 3 8B SFT (50\%)", + "L3.1-8B-v3.9-nc-downsample-0.75__meta-llama_Llama-3.1-8B__42__1731214576": "Tulu 3 8B SFT (75\%)", +} + +# Metric keys definition +metric_keys = { + "MMLU": "mmlu:mc::tulu", + "TruthfulQA": "truthfulqa", + "PopQA": "popqa", + "BigBenchHard": "bbh:cot::tulu", + "HumanEval": "codex_humaneval", + "HumanEval+": "codex_humanevalplus", + "GSM8K": "gsm8k", + "DROP": "drop", + "MATH": "math::flex", + "IFEval": "ifeval", + "AlpacaEval 2": "alpaca_eval", + "Safety": "overall_oe_safety_average", +} + +eval_settings = { + "MMLU": "5 shot", + "TruthfulQA": "6 shot", + "PopQA": "15 shot", + "BigBenchHard": "3 shot, CoT", + "HumanEval": "pass@10", + "HumanEval+": "pass@10", + "GSM8K": "8 shot, CoT", + "DROP": "3 shot", + "MATH": "4 shot CoT, Flex", + "IFEval": "Strict", + "AlpacaEval 2": "LC \% win", + "Safety": "", +} + +# Change this to change the table size +AVERAGE_KEYS = [ + "alpaca_eval", + "bbh:cot::tulu", + "codex_humaneval", + "codex_humanevalplus", + "drop", + "gsm8k", + "ifeval", + "math::flex", + "mmlu:mc::tulu", + "popqa", + "truthfulqa", + "overall_oe_safety_average", +] + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Create a table of model performance metrics.", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + # Required arguments + parser.add_argument( + "--csv-path", required=True, help="Path to the CSV file containing the results" + ) + parser.add_argument( + "--models", + nargs="+", + required=True, + help="List of model names to generate table for", + ) + parser.add_argument( + "--markdown", + action="store_true", + help="Output in Markdown format instead of LaTeX", + ) + parser.add_argument( + "--extra_cols", + type=int, + default=0, + help="Number of extra columns to add to the table", + ) + + return parser.parse_args() + + +def format_value(value, markdown=False): + """Format a numeric value for table output.""" + if pd.isna(value): + return "N/A" + try: + return f"{float(value):.1f}" + except: + return "N/A" + + +def create_performance_table_rows(csv_path, model_names, markdown=False, extra_cols=0): + """ + Create performance table rows for the specified models. 
+ + Parameters: + csv_path (str): Path to the CSV file containing the results + model_names (list): List of model names to generate table for + markdown (bool): Whether to output in Markdown format + extra_cols (int): Number of extra columns to add to the table + """ + + try: + all_data = {} + df = pd.read_csv(csv_path) + rows = [] + + for model_name in model_names: + model_data = df[df["Model"] == model_name] + if len(model_data) == 0: + print(f"Warning: Model '{model_name}' not found in CSV file") + continue + + # Get pretty model name from conversion dictionary + pretty_name = model_label_conversion.get(model_name, model_name) + + # Replace "Tulu" with "\modelname" for LaTeX output only + if not markdown: + pretty_name = pretty_name.replace("Tulu ", "\\modelname~") + + all_data[pretty_name] = {} + + # Calculate average + for key in AVERAGE_KEYS: + model_data[key] = model_data[key].apply( + lambda x: float(x) if x != "nan" else None + ) + average = model_data[AVERAGE_KEYS].mean(axis=1).iloc[0] + all_data[pretty_name]["Avg."] = format_value(average, markdown) + + # add all the eval scores + for metric_name, metric_key in metric_keys.items(): + value = model_data[metric_key].iloc[0] + all_data[pretty_name][metric_name] = format_value(value, markdown) + + for metric_name in ["Avg."] + list(metric_keys.keys()): + values = [metric_name] + if metric_name == "Avg.": + values.append("") + else: + values.append(f"\\small{{{eval_settings[metric_name]}}}") + for pretty_name in all_data.keys(): + values.append(all_data[pretty_name][metric_name]) + + values = ["-1" if i == "N/A" else i for i in values] + numbers = [float(v) for v in values[2:]] + max_index = numbers.index(max(numbers)) + 2 + values[max_index] = f"\\textbf{{{values[max_index]}}}" + + if markdown: + # Markdown table row with pretty name + r = f"| | {' | '.join(values)} |" + r += " |" * extra_cols + rows.append(r) + else: + # LaTeX table row with pretty name + r = f"{' & '.join(values)}" + r += " &" * extra_cols + r += " \\\\" + rows.append(r) + if metric_name == "Avg.": + rows.append("\\midrule") + + return rows + + except FileNotFoundError: + print(f"Error: Could not find CSV file at {csv_path}") + sys.exit(1) + except pd.errors.EmptyDataError: + print(f"Error: CSV file at {csv_path} is empty") + sys.exit(1) + + +def create_latex_table(model_names, extra_cols): + """Return the LaTeX table header.""" + header = """\\begin{table}[] +\\centering +\\setlength\\tabcolsep{5pt} +\\adjustbox{max width=\\linewidth}{ +""" + column_spec = "ll" + for model_name in model_names: + if "Tulu" in model_label_conversion[model_name]: + # P is defined via \newcolumntype{P}{>{\columncolor{ai2pink}}c} + column_spec += "l" + else: + column_spec += "c" + column_spec += "c" * extra_cols + + header += ( + """\\begin{NiceTabular}{@{}""" + + column_spec + + """@{}} +\\toprule +""" + ) + header += """\\textbf{Benchmark} & \\textbf{Eval Setting}""" + for model_name in model_names: + pretty_name = model_label_conversion.get(model_name, model_name) + if "Tulu" in pretty_name: + pretty_name = pretty_name.replace("Tulu ", "\\modelname~") + pretty_name = f"\\textbf{{{pretty_name}}}" + header += " & \\rotatebox{90}{" + pretty_name + "}" + for i in range(extra_cols): + header += " & " + header += """\\\\\\midrule""" + return header + + +def create_latex_footer(): + """Return the LaTeX table footer.""" + return """\\bottomrule +\\end{NiceTabular}} +\\vspace{3pt} +\\caption{TODO} +\\label{tab:TODO} +\\end{table}""" + + +def main(): + """Main function to run the script.""" 
+ args = parse_args() + + rows = create_performance_table_rows( + csv_path=args.csv_path, + model_names=args.models, + markdown=args.markdown, + extra_cols=args.extra_cols, + ) + + if not args.markdown: + print(create_latex_table(model_names=args.models, extra_cols=args.extra_cols)) + + for row in rows: + print(row) + + if not args.markdown: + print(create_latex_footer()) + + +if __name__ == "__main__": + main() diff --git a/tulu_version_bars.pdf b/tulu_version_bars.pdf new file mode 100644 index 000000000..2f5f0809b Binary files /dev/null and b/tulu_version_bars.pdf differ
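
For reference, the merge config that the new scripts/submit_merge_job.py consumes (its default --merge_config points at configs/merge_configs/example_linear_merge_config.yaml) is not included in this diff. Below is a minimal sketch inferred from the fields the script actually reads: merge_method, normalize, and a models list whose entries carry name, location, path, and weight (plus wekaBucket for weka-hosted models, and a top-level base_model when merge_method is task_arithmetic). The model names, paths, and weights here are illustrative placeholders, not the contents of the real example config.

merge_method: linear          # or task_arithmetic (base_model is then required)
normalize: true
models:
  - name: model-one                        # mounted at /model-one when location is beaker
    location: beaker
    path: jacobm/example-beaker-dataset    # hypothetical Beaker dataset id
    weight: 0.5
  - name: model-two
    location: huggingface
    path: allenai/example-hf-model         # hypothetical Hugging Face repo
    weight: 0.5

A config like this would be submitted with the script's defaults, e.g.:

python scripts/submit_merge_job.py --merge_config configs/merge_configs/example_linear_merge_config.yaml --workspace tulu-3-dev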