diff --git a/Dockerfile b/Dockerfile index dd6b95a97..eca45c7bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -91,6 +91,7 @@ RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url h RUN pip install packaging RUN pip install flash-attn==2.6.3 --no-build-isolation RUN pip install -r requirements.txt +RUN pip install git+https://github.com/arcee-ai/mergekit.git # NLTK download RUN python -m nltk.downloader punkt diff --git a/configs/beaker_configs/default_dpo.yaml b/configs/beaker_configs/default_dpo.yaml index 87685b5fe..08eacd1a7 100644 --- a/configs/beaker_configs/default_dpo.yaml +++ b/configs/beaker_configs/default_dpo.yaml @@ -8,7 +8,7 @@ tasks: command: [ '/bin/sh', '-c' ] - arguments: ['PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + arguments: ['pip install --upgrade transformers && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch --mixed_precision bf16 --num_machines 1 --num_processes 4 @@ -37,7 +37,7 @@ tasks: - name: TRANSFORMERS_CACHE value: ./cache/ - name: WANDB_API_KEY - secret: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY - name: WANDB_PROJECT value: open-instruct - name: WANDB_WATCH @@ -47,7 +47,7 @@ tasks: - name: WANDB_DISABLED value: true - name: HF_TOKEN - secret: HF_TOKEN + secret: jacobm_HF_TOKEN datasets: - mountPath: /oe-adapt-default source: diff --git a/configs/beaker_configs/default_eval.yaml b/configs/beaker_configs/default_eval.yaml index 3b4553ed6..ba9f55731 100644 --- a/configs/beaker_configs/default_eval.yaml +++ b/configs/beaker_configs/default_eval.yaml @@ -35,16 +35,19 @@ tasks: - name: WANDB_DISABLED value: true - name: OPENAI_API_KEY - secret: openai_api_key + secret: jacobm_OPENAI_API_KEY - name: HF_TOKEN - secret: HF_TOKEN + secret: jacobm_HF_TOKEN datasets: - - mountPath: /data/ + - mountPath: /oe-adapt-default source: - beaker: hamishivi/open-instruct-eval-data + weka: oe-adapt-default - mountPath: /model source: beaker: 01GVYXDGJC6DV0JW9JZ16YM07G + - mountPath: /data/ + source: + beaker: hamishivi/open-instruct-eval-data - mountPath: /net/nfs.cirrascale source: hostPath: /net/nfs.cirrascale diff --git a/configs/beaker_configs/default_finetune.yaml b/configs/beaker_configs/default_finetune.yaml index bd5e05c06..d10680de5 100644 --- a/configs/beaker_configs/default_finetune.yaml +++ b/configs/beaker_configs/default_finetune.yaml @@ -37,7 +37,7 @@ tasks: - name: TRANSFORMERS_CACHE value: ./cache/ - name: WANDB_API_KEY - secret: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY - name: WANDB_PROJECT value: open-instruct - name: WANDB_WATCH @@ -47,11 +47,14 @@ tasks: - name: WANDB_DISABLED value: true - name: HF_TOKEN - secret: HF_TOKEN + secret: jacobm_HF_TOKEN datasets: - mountPath: /oe-adapt-default source: weka: oe-adapt-default + - mountPath: /oe-training-default + source: + weka: oe-training-default result: path: /output resources: diff --git a/configs/beaker_configs/default_finetune_multinode.yaml b/configs/beaker_configs/default_finetune_multinode.yaml index 03ed976af..7b10fa8c7 100644 --- a/configs/beaker_configs/default_finetune_multinode.yaml +++ b/configs/beaker_configs/default_finetune_multinode.yaml @@ -51,7 +51,7 @@ tasks: - name: TRANSFORMERS_CACHE value: ./cache/ - name: WANDB_API_KEY - secret: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY - name: WANDB_PROJECT value: open-instruct - name: WANDB_WATCH @@ -61,11 +61,14 @@ tasks: - name: WANDB_DISABLED value: true - name: HF_TOKEN - secret: HF_TOKEN + secret: jacobm_HF_TOKEN datasets: - mountPath: /oe-adapt-default source: weka: oe-adapt-default + # - mountPath: /model 
+ # source: + # beaker: jacobm/llama-3.1-8b result: path: /output resources: diff --git a/configs/beaker_configs/default_finetune_multinode_augusta.yaml b/configs/beaker_configs/default_finetune_multinode_augusta.yaml new file mode 100644 index 000000000..3766bdc4b --- /dev/null +++ b/configs/beaker_configs/default_finetune_multinode_augusta.yaml @@ -0,0 +1,128 @@ +version: v2 +description: open-instruct-finetune-multinode +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune-multinode + replicas: 4 + leaderSelection: true + hostNetworking: true + propagateFailure: true + propagatePreemption: true + synchronizedStartTimeout: 60m + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: [' + unset CUDA_LAUNCH_BLOCKING && export LD_LIBRARY_PATH=/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH} && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 4 + --num_processes 32 + --machine_rank $BEAKER_REPLICA_RANK + --main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME + --main_process_port 29400 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf + --deepspeed_multinode_launcher standard + open_instruct/finetune.py + --model_name_or_path meta-llama/Meta-Llama-3-8B + --tokenizer_name meta-llama/Meta-Llama-3-8B + --use_slow_tokenizer + --use_flash_attn + --max_seq_length 4096 + --preprocessing_num_workers 16 + --per_device_train_batch_size 1 + --gradient_accumulation_steps 4 + --learning_rate 5e-6 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. + --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + --reduce_loss sum + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + - name: NCCL_CROSS_NIC + value: 0 + - name: NCCL_ALGO + value: Ring,Tree + - name: NCCL_PROTO + value: Simple + - name: NCCL_MIN_NCHANNELS + value: 4 + - name: NCCL_P2P_NET_CHUNKSIZE + value: 524288 + - name: NCCL_P2P_PCI_CHUNKSIZE + value: 524288 + - name: NCCL_P2P_NVL_CHUNKSIZE + value: 1048576 + - name: NCCL_FASTRAK_NUM_FLOWS + value: 2 + - name: NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL + value: 0 + - name: NCCL_BUFFSIZE + value: 8388608 + - name: NCCL_FASTRAK_USE_SNAP + value: 1 + - name: CUDA_VISIBLE_DEVICES + value: 0,1,2,3,4,5,6,7 + - name: NCCL_NET_GDR_LEVEL + value: PIX + - name: NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING + value: 0 + - name: NCCL_TUNER_PLUGIN + value: libnccl-tuner.so + - name: NCCL_TUNER_CONFIG_PATH + value: /var/lib/tcpxo/lib64/a3plus_tuner_config.textproto + - name: NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE + value: /var/lib/tcpxo/lib64/a3plus_guest_config.textproto + - name: NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS + value: 600000 + - name: NCCL_NVLS_ENABLE + value: 0 + - name: NCCL_DEBUG + value: WARN + - name: NCCL_FASTRAK_CTRL_DEV + value: enp0s12 + - name: NCCL_FASTRAK_IFNAME + value: enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 + - name: NCCL_SOCKET_IFNAME + value: enp0s12 + - name: NCCL_USE_SNAP + value: 1 + - name: NCCL_FASTRAK_USE_LLCM + value: 1 + - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY + value: /dev/aperture_devices + + datasets: + - mountPath: /oe-adapt-default + 
source: + weka: oe-adapt-default + result: + path: /output + resources: + gpuCount: 8 + context: + priority: normal + preemptible: true \ No newline at end of file diff --git a/configs/beaker_configs/default_finetune_multinode_olmo.yaml b/configs/beaker_configs/default_finetune_multinode_olmo.yaml new file mode 100644 index 000000000..b42800cee --- /dev/null +++ b/configs/beaker_configs/default_finetune_multinode_olmo.yaml @@ -0,0 +1,78 @@ +version: v2 +description: open-instruct-finetune-multinode +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune-multinode + replicas: 4 + leaderSelection: true + hostNetworking: true + propagateFailure: true + propagatePreemption: true + synchronizedStartTimeout: 60m + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: [' + unset CUDA_LAUNCH_BLOCKING && pip install git+https://github.com/vwxyzjn/transformers.git@olmo1124_classification && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 4 + --num_processes 32 + --machine_rank $BEAKER_REPLICA_RANK + --main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME + --main_process_port 29400 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf + --deepspeed_multinode_launcher standard + open_instruct/finetune.py + --model_name_or_path meta-llama/Meta-Llama-3-8B + --tokenizer_name meta-llama/Meta-Llama-3-8B + --use_slow_tokenizer + --use_flash_attn + --max_seq_length 4096 + --preprocessing_num_workers 16 + --per_device_train_batch_size 1 + --gradient_accumulation_steps 4 + --learning_rate 5e-6 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. + --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + --reduce_loss sum + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + datasets: + - mountPath: /oe-adapt-default + source: + weka: oe-adapt-default + # - mountPath: /model + # source: + # beaker: jacobm/llama-3.1-8b + result: + path: /output + resources: + gpuCount: 8 + context: + priority: normal + preemptible: true \ No newline at end of file diff --git a/configs/beaker_configs/default_finetune_multinode_olmoe.yaml b/configs/beaker_configs/default_finetune_multinode_olmoe.yaml new file mode 100644 index 000000000..771dc41b7 --- /dev/null +++ b/configs/beaker_configs/default_finetune_multinode_olmoe.yaml @@ -0,0 +1,78 @@ +version: v2 +description: open-instruct-finetune-multinode +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune-multinode + replicas: 4 + leaderSelection: true + hostNetworking: true + propagateFailure: true + propagatePreemption: true + synchronizedStartTimeout: 60m + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: [' + unset CUDA_LAUNCH_BLOCKING && pip install --upgrade transformers && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 4 + --num_processes 32 + --machine_rank $BEAKER_REPLICA_RANK + --main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME + --main_process_port 29400 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf + 
--deepspeed_multinode_launcher standard + open_instruct/finetune.py + --model_name_or_path meta-llama/Meta-Llama-3-8B + --tokenizer_name meta-llama/Meta-Llama-3-8B + --use_slow_tokenizer + --use_flash_attn + --max_seq_length 4096 + --preprocessing_num_workers 16 + --per_device_train_batch_size 1 + --gradient_accumulation_steps 4 + --learning_rate 5e-6 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. + --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + --reduce_loss sum + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + datasets: + - mountPath: /oe-adapt-default + source: + weka: oe-adapt-default + # - mountPath: /model + # source: + # beaker: jacobm/llama-3.1-8b + result: + path: /output + resources: + gpuCount: 8 + context: + priority: normal + preemptible: true \ No newline at end of file diff --git a/configs/beaker_configs/default_finetune_olmo.yaml b/configs/beaker_configs/default_finetune_olmo.yaml new file mode 100644 index 000000000..7a3236d17 --- /dev/null +++ b/configs/beaker_configs/default_finetune_olmo.yaml @@ -0,0 +1,65 @@ +version: v2 +description: open-instruct-finetune +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: ['pip install git+https://github.com/vwxyzjn/transformers.git@olmo1124_classification && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 1 + --num_processes 4 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf + open_instruct/finetune.py + --model_name_or_path /hf_llama_models + --use_flash_attn + --max_seq_length 2048 + --preprocessing_num_workers 16 + --per_device_train_batch_size 2 + --gradient_accumulation_steps 16 + --learning_rate 2e-5 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. 
+ --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + datasets: + - mountPath: /oe-adapt-default + source: + weka: oe-adapt-default + - mountPath: /oe-training-default + source: + weka: oe-training-default + result: + path: /output + resources: + gpuCount: 4 + context: + cluster: ai2/allennlp-cirrascale + priority: high + preemptible: false \ No newline at end of file diff --git a/configs/beaker_configs/default_merge.yaml b/configs/beaker_configs/default_merge.yaml new file mode 100644 index 000000000..446acee54 --- /dev/null +++ b/configs/beaker_configs/default_merge.yaml @@ -0,0 +1,40 @@ +version: v2 +description: open-instruct-merge-models +budget: ai2/oe-adapt +tasks: + - name: open-instruct-merge-models + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: ['mkdir {OUTPUT_DIR}; echo {RAW_CONFIG} > {OUTPUT_DIR}/config.yaml; mergekit-yaml {OUTPUT_DIR}/config.yaml {OUTPUT_DIR} --cuda'] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + result: + path: /output + resources: + gpuCount: 1 + context: + priority: low + preemptible: true + constraints: + cluster: + - ai2/neptune-cirrascale + - ai2/saturn-cirrascale + - ai2/jupiter-cirrascale-2 \ No newline at end of file diff --git a/configs/merge_configs/70b-soup.yaml b/configs/merge_configs/70b-soup.yaml new file mode 100644 index 000000000..6ba42cf09 --- /dev/null +++ b/configs/merge_configs/70b-soup.yaml @@ -0,0 +1,18 @@ +merge_method: linear +normalize: true +models: + # - name: L3.1-70B-v3.9-nc-2e-6-2_ep-fixed + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-70B-v3.9-nc-2e-6-2_ep-fixed/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + - name: L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-2 + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-2/ + wekaBucket: "oe-adapt-default" + weight: 1.0 + - name: L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-3 + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-3/ + wekaBucket: "oe-adapt-default" + weight: 1.0 \ No newline at end of file diff --git a/configs/merge_configs/base_configs/default_linear_merge.yaml b/configs/merge_configs/base_configs/default_linear_merge.yaml new file mode 100644 index 000000000..3557bc440 --- /dev/null +++ b/configs/merge_configs/base_configs/default_linear_merge.yaml @@ -0,0 +1,10 @@ +models: + - model: /model-one + parameters: + weight: 1.0 + - model: /model-two + parameters: + weight: 1.0 +normalize: true +merge_method: linear +dtype: bfloat16 \ No newline at end of file diff --git a/configs/merge_configs/base_configs/default_task_arithmetic_merge.yaml b/configs/merge_configs/base_configs/default_task_arithmetic_merge.yaml new file mode 
100644 index 000000000..e2f55159a --- /dev/null +++ b/configs/merge_configs/base_configs/default_task_arithmetic_merge.yaml @@ -0,0 +1,14 @@ +models: + # no parameters necessary for base model + - model: /base-model + - model: /model-one + parameters: + weight: 0.70 + normalize: False + - model: /model-2 + parameters: + weight: 0.30 + normalize: False +merge_method: task_arithmetic +base_model: /base-model +dtype: bfloat16 \ No newline at end of file diff --git a/configs/merge_configs/example_linear_merge_config.yaml b/configs/merge_configs/example_linear_merge_config.yaml new file mode 100644 index 000000000..8f902c28f --- /dev/null +++ b/configs/merge_configs/example_linear_merge_config.yaml @@ -0,0 +1,11 @@ +merge_method: linear +normalize: true +models: + - name: name + location: beaker + path: jacobm/beaker-dataset + weight: 0.5 + - name: name2 + location: huggingface + path: allenai/llama-3-tulu-2 + weight: 0.5 \ No newline at end of file diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml new file mode 100644 index 000000000..3c7246a6d --- /dev/null +++ b/configs/merge_configs/my-merge-config.yaml @@ -0,0 +1,48 @@ +merge_method: linear +normalize: true +models: + # - name: llama-3.1-8b-resized + # location: huggingface + # path: ai2-adapt-dev/llama-3.1-8b-resized + # weight: 0.5 + # - name: L3.1-8B-v3.9-nc-fixed-soup-best_2 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-best_2/ + # wekaBucket: "oe-adapt-default" + # weight: 0.5 + + - name: gsm_math_if_valpy_best_overall_avg_8b_beta0.05-step_200 + location: weka + path: /oe-adapt-default/hamishi/model_checkpoints/gsm_math_if_valpy_best_overall_avg_8b_beta0.05_checkpoints/step_200/ + wekaBucket: "oe-adapt-default" + weight: 1.0 + - name: gsm_math_if_valpy_best_and_if_avg_8b_beta0.05-step_200 + location: weka + path: /oe-adapt-default/hamishi/model_checkpoints/gsm_math_if_valpy_best_and_if_avg_8b_beta0.05_checkpoints/step_200/ + wekaBucket: "oe-adapt-default" + weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-2 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-2/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-3 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-3/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-1 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-1/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-5 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-5/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-4 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-4/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 \ No newline at end of file diff --git a/configs/merge_configs/my-task-arithmetic-config.yaml b/configs/merge_configs/my-task-arithmetic-config.yaml new file mode 100644 index 000000000..d91e75d30 --- /dev/null +++ b/configs/merge_configs/my-task-arithmetic-config.yaml @@ -0,0 +1,22 @@ +merge_method: task_arithmetic +base_model: ai2-adapt-dev/llama-3.1-8b-resized +normalize: true +models: + # - name: deepseek-math-7b-instruct + # location: huggingface + # path: 
deepseek-ai/deepseek-math-7b-instruct + # weight: 0.5 + # - name: deepseek-coder-7b-instruct-v1.5 + # location: huggingface + # path: deepseek-ai/deepseek-coder-7b-instruct-v1.5 + # weight: 0.5 + - name: L3.1-8B-v3.8-nc-final + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-nc-final/ + wekaBucket: "oe-adapt-default" + weight: 1.0 + - name: L3.1-8B-v3.8-math_subset + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-math_subset/ + wekaBucket: "oe-adapt-default" + weight: 0.46 \ No newline at end of file diff --git a/configs/train_configs/dpo/my-test-dpo.yaml b/configs/train_configs/dpo/my-test-dpo.yaml new file mode 100644 index 000000000..543b50c53 --- /dev/null +++ b/configs/train_configs/dpo/my-test-dpo.yaml @@ -0,0 +1,31 @@ +model_name_or_path: /model +tokenizer_name: /model +model_revision: main +use_flash_attn: true +gradient_checkpointing: true +# dataset_name: ai2-adapt-dev/tulu3.4-sft-replica-50k +# dataset_config_name: gpt4-prefs-on-policy +dataset_mixer: + ai2-adapt-dev/tulu3.4-sft-replica-50k-gpt4-prefs-on-policy: 1.0 + ai2-adapt-dev/personahub_if_pref_data_manualseed_v2_19890: 1.0 + ai2-adapt-dev/helpsteer2-uf-pipeline-regen: 1.0 + allenai/ultrafeedback_binarized_cleaned_train: 1.0 +use_slow_tokenizer: true +max_seq_length: 2048 +preprocessing_num_workers: 16 +per_device_train_batch_size: 1 +gradient_accumulation_steps: 16 # designed for 8 GPUs, so batch size 128 +learning_rate: 5.0e-7 +lr_scheduler_type: linear +warmup_ratio: 0.1 +weight_decay: 0.0 +num_train_epochs: 1 +output_dir: /output +with_tracking: true +report_to: + - wandb +logging_steps: 1 +use_lora: false +dpo_loss_type: dpo_norm +dpo_beta: 5 +checkpointing_steps: 1000 \ No newline at end of file diff --git a/configs/train_configs/dpo/olmoe_dpo_test.yaml b/configs/train_configs/dpo/olmoe_dpo_test.yaml new file mode 100644 index 000000000..bed7f3037 --- /dev/null +++ b/configs/train_configs/dpo/olmoe_dpo_test.yaml @@ -0,0 +1,37 @@ +model_name_or_path: /model +tokenizer_name: /model +model_revision: main +use_flash_attn: true +gradient_checkpointing: true +dataset_mixer: + # ai2-adapt-dev/sft_v3.9_used_off_policy: 1.0 + # ai2-adapt-dev/sft_v3.9_used_on_policy_large_70b_ckpt: 1.0 + # ai2-adapt-dev/DaringAnteater-prefs-RM-filter-uf-pipeline-regen-v3.9_large_70b_ckpt: 1.0 + # ai2-adapt-dev/WildChat-prefs-280824-uf-pipeline-regen-v3.9_large_70b_ckpt: 1.0 + # ai2-adapt-dev/Llama-3.1-if_taxonomy_tulu-uf-pipeline-regen-v3.9_large_70b_ckpt: 1.0 + ai2-adapt-dev/wildchat_v3.9_unused_off_policy: 1.0 + + ai2-adapt-dev/sft_v3.9_used_p0_olmoe-1b-7b: 1.0 + ai2-adapt-dev/sft_v3.9_used_p1_olmoe-1b-7b: 1.0 + ai2-adapt-dev/daring_anteater_olmoe-1b-7b: 1.0 + ai2-adapt-dev/wildchat-prefs-280824_olmoe-1b-7b: 1.0 + ai2-adapt-dev/llama3.1-if_taxonomy_tulu_olmoe-1b-7b: 1.0 +use_slow_tokenizer: true +max_seq_length: 2048 +preprocessing_num_workers: 16 +per_device_train_batch_size: 2 +gradient_accumulation_steps: 8 # designed for 8 GPUs, so batch size 128 +learning_rate: 5.0e-7 +lr_scheduler_type: linear +warmup_ratio: 0.1 +weight_decay: 0.0 +num_train_epochs: 1 +output_dir: /output +with_tracking: true +report_to: + - wandb +logging_steps: 1 +use_lora: false +dpo_loss_type: dpo_norm +dpo_beta: 5 +checkpointing_steps: 1000 \ No newline at end of file diff --git a/configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml b/configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml new file mode 100644 index 000000000..9fe931118 --- 
/dev/null +++ b/configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml @@ -0,0 +1,56 @@ +model_name_or_path: allenai/open_instruct_dev +model_revision: peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-olmo_1124 +use_flash_attn: true +tokenizer_name: allenai/open_instruct_dev +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 huggingface dataset + # allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # General datasets: + ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # Math datasets: + ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # Coding datasets: + ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # IF datasets: + ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # Safety datasets: + # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + + # Specialty datasets: + ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 4 # effective batch size 128 with 4 nodes +learning_rate: 5.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/olmoe_v3.9.yaml b/configs/train_configs/sft/olmoe_v3.9.yaml new file mode 100644 index 000000000..c4b61014c --- /dev/null +++ b/configs/train_configs/sft/olmoe_v3.9.yaml @@ -0,0 +1,52 @@ +model_name_or_path: allenai/OLMoE-1B-7B-0924 +model_revision: main +tokenizer_name: allenai/OLMoE-1B-7B-0924 +use_slow_tokenizer: true +dataset_mixer: + allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # # General datasets: + # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + # allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # # Specialty datasets: + # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + # 
ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 +max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3 +preprocessing_num_workers: 128 +per_device_train_batch_size: 2 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 1 # effective batch size 128 with 4 nodes +learning_rate: 2.0e-05 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +dataset_mix_dir: /output/ +checkpointing_steps: epoch +# keep_last_n_checkpoints: 1 +# load_balancing_loss: false # TODO: set to false +# load_balancing_weight: 0.5 +add_bos: true \ No newline at end of file diff --git a/configs/train_configs/sft/peteish_1124_preview_mix_v3.9.yaml b/configs/train_configs/sft/peteish_1124_preview_mix_v3.9.yaml new file mode 100644 index 000000000..e56ed1398 --- /dev/null +++ b/configs/train_configs/sft/peteish_1124_preview_mix_v3.9.yaml @@ -0,0 +1,59 @@ +model_name_or_path: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan/step11931-hf +model_revision: main +use_flash_attn: true +tokenizer_name: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan/step11931-hf +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 nc mix file + # WIP + + # Static v3.9 huggingface dataset + allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # # General datasets: + # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 # 49980 # all + # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # # Safety datasets: + # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + + # # Specialty datasets: + # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + +max_seq_length: 4096 # need to increase to 8k +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes +learning_rate: 2.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/qwen2.5_7b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/qwen2.5_7b_preview_mix_v3.9-noncommercial.yaml new file mode 100644 index 000000000..c5c8b488d --- /dev/null +++ 
b/configs/train_configs/sft/qwen2.5_7b_preview_mix_v3.9-noncommercial.yaml @@ -0,0 +1,56 @@ +model_name_or_path: Qwen/Qwen2.5-Math-7B +model_revision: main +use_flash_attn: true +tokenizer_name: Qwen/Qwen2.5-Math-7B +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 huggingface dataset + allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # # General datasets: + # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + # allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # # Safety datasets: + # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + + # # Specialty datasets: + # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 4 # effective batch size 128 with 8 nodes +learning_rate: 5.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/train-math-only-model.yaml b/configs/train_configs/sft/train-math-only-model.yaml new file mode 100644 index 000000000..86a600f9d --- /dev/null +++ b/configs/train_configs/sft/train-math-only-model.yaml @@ -0,0 +1,179 @@ +model_name_or_path: meta-llama/Meta-Llama-3-8B +model_revision: main +use_flash_attn: true +tokenizer_name: meta-llama/Meta-Llama-3-8B +use_slow_tokenizer: true +dataset_mixer: + # ------------------------------------------------------ + # no_robot dataset, human written, for general chat. + # Total: 9500 + # Pro: created by scale ai with high cost, should be high quality. + # Con: small, not diverse enough, may not be in consistent style. + HuggingFaceH4/no_robots: 9500 + # ------------------------------------------------------ + # OpenAssistant dataset, human written, for general chat. + # Here, only the highest rated paths are extracted. + # Total: 7708 + # Pro: created and reviewed by human volunteers, has multi-turn chat. + # Con: small, still has some noise, the writting quality may not be as good/careful as paid workers, style consistency. + # TODO: need to check if this version corresponds to the highest rated paths. + allenai/openassistant-guanaco-reformatted: 7708 + # ------------------------------------------------------ + # LIMA dataset, human written, for general chat. + # Some instances were filtered in building Tulu 2, probably due to some identity keywords. 
+  # Total: 1018
+  # Pro: created by researchers at Meta, aiming for diversity and high quality.
+  # Con: small, they were created quite early so might not consider some of the latest answering styles of chatbots.
+  # natolambert/tulu-v2-sft-mixture-lima: 1018
+  # ------------------------------------------------------
+  # Aya dataset, human written, for general chat (multilingual).
+  # Total: 202362
+  # Pro: created by ..., aiming for very diverse languages ().
+  # Con: answers may not be in the perfect style.
+  # ai2-adapt-dev/aya_dataset-reformat: 202362
+  # ------------------------------------------------------
+  # Tulu hard-coded examples, human written, for identity-related questions.
+  # Total: 14
+  # Pro: necessary to make Tulu aware of itself and its builders.
+  # Con: small, low coverage of possible questions from users.
+  # TODO: we should later find ways to replicate this multiple times.
+  ai2-adapt-dev/tulu_hard_coded_examples: 14
+  # ------------------------------------------------------
+  # CoT subset in FLAN v2, human (researchers) converted from existing datasets, for reasoning.
+  # Here, we use the subset processed in Tulu v2.
+  # Total: 48747
+  # Pro: researchers converted from 9 chain-of-thought datasets about arithmetic, multi-hop reasoning, and NLI.
+  # Con: limited in the task type, written early, may have inconsistent styles compared to today's chatbots.
+  # natolambert/tulu-v2-sft-mixture-cot: 49747
+  # ------------------------------------------------------
+  # SciRIFF dataset, human (researchers) converted from existing datasets, for scientific literature understanding.
+  # Here, we use the subset extracted by the author in building allenai/SciRIFF-train-mix.
+  # Total: 35357
+  # Pro: researchers converted from existing datasets for 54 scientific literature understanding tasks.
+  # Con: limited in the task type, may have inconsistent styles compared to today's chatbots.
+  # TODO: need to ablate and compare with the one in the tulu 2 mixture natolambert/tulu-v2-sft-mixture-science
+  # natolambert/tulu-v2-sft-mixture-science: 7468 # original data slightly different
+  # ai2-adapt-dev/SciRIFF-train-mix-science: 10000
+  # ------------------------------------------------------
+  # SlimOrca dataset, gpt4 generated, for general chat.
+  # Total: 517982
+  # Pro: Pairing FLAN v2 inputs with system prompts, and regenerating the outputs using GPT4, potentially in a better style.
+  # Con: GPT4 responses may contain errors, which may be mitigated by the filtering in SlimOrca
+  # TODO: need to ablate and compare with the 300K one Faeze created. May benefit from regeneration.
+  # ai2-adapt-dev/slim-orca-300k: 100000
+  ai2-adapt-dev/SlimOrca-reformat: 100000
+  # ------------------------------------------------------
+  # WizardLM evol instruct dataset, gpt4 generated, for general chat.
+  # Total: 196000
+  # Pro: the approach deepens the complexity of gpt4-generated data
+  # Con: GPT4 generations have errors, and may also inherit the biases/styles of GPT4
+  # TODO: need to ablate.
+  WizardLMTeam/WizardLM_evol_instruct_V2_196k: 30000
+  # ------------------------------------------------------
+  # WildChat dataset, real user queries + gpt4 responses, for general chat.
+  # Total: 254663 (1M if including those interacting with gpt 3.5)
+  # Pro: real user queries, may contain diverse challenging scenarios, as well as unsafe prompts. Multi-turn.
+  # Con: user queries are usually not that well-formatted, and contain a lot of noise.
+  # ai2-adapt-dev/WildChat-1M-Full-GPT4-Only: 254663
+  # ------------------------------------------------------
+  # ShareGPT dataset, real user shared queries + gpt4 responses, for general chat.
+  # Total: 114046
+  # Pro: user shared queries usually contain interesting phenomena. Multi-turn.
+  # Con: unclear licensing; the responses were generated using an earlier version of GPT4.
+  # TODO: need to ablate. May benefit from regeneration.
+  # Vtuber-plan/sharegpt-cleaned: 114046
+  # ------------------------------------------------------
+  # Daring-Anteater, a mix of existing datasets, for general chat.
+  # Total: 99532
+  # Pro: a good mix of precise_instruction_following / json_format_following / complex instructions.
+  # Con: the constraint-following part is too small.
+  # TODO: need to ablate whether excluding the main chat subset is helpful.
+  # TODO: data needs to be reformatted to consider the system prompt.
+  ai2-adapt-dev/Daring-Anteater-reformat: 99532
+  # ------------------------------------------------------
+  # MetaMathQA dataset, augmented using gpt4, for math capability.
+  # Total: 395000
+  # Pro: augmented towards GSM/MATH, so good performance on these two benchmarks (probably similar questions too)
+  # Con: may be too targeted at the two benchmarks and fail to generalize to other math problems in different styles.
+  # ai2-adapt-dev/metamath-qa-reformat: 100000
+  # ------------------------------------------------------
+  # WebInstruct dataset, extracted & rewritten using gpt4, (mainly) for math/science related questions
+  # Here, we are using their released subset.
+  # Total: 2335220
+  # Pro: the generation benefits from GPT4's answering style & the correctness of grounding to web documents.
+  # Con: may be biased by the response styles of the three websites (MathStackExchange, ScienceStackExchange, Socratic);
+  # the question answering styles are also not diverse enough, with different instruction constraints;
+  # the answers may still have some errors (10% based on the paper)
+  # TODO: need to ablate the effect.
+  # ai2-adapt-dev/WebInstructSub-reformat: 100000
+  # ------------------------------------------------------
+  # Codefeedback Filtered Instruction, a mix of existing datasets, for coding
+  # The data mix includes:
+  #   Magicoder-OSS-Instruct
+  #   Python code subset of ShareGPT
+  #   Magicoder-Evol-Instruct
+  #   Evol-Instruct-Code
+  # Total: 156526
+  # Pro: a decent mix of existing coding prompts
+  # Con: curated mainly for the prompts in building the real CodeFeedback, so responses may be low quality (e.g., ShareGPT)
+  # TODO: change to individual datasets and ablate the effect. May benefit from regeneration.
+  # m-a-p/CodeFeedback-Filtered-Instruction: 156526
+  # ------------------------------------------------------
+  # Codefeedback dataset, a mix of existing datasets + feedback interaction generation, for coding
+  # Total: 66383
+  # Pro: single-turn packing + interaction simulation seems to create a good coding model that takes feedback across multiple turns.
+  # Con: not sure how diverse the feedback is and how well it can generalize
+  # TODO: need to ablate. need to change code to downweight the intermediate responses with errors!!!
+  # m-a-p/Code-Feedback: 66383
+  # ------------------------------------------------------
+  # Table-GPT dataset, converted & synthesized, for table understanding and operations
+  # Total: 13222
+  # Pro: a special dataset that contains 14 table-related tasks for enhancing table capabilities
+  # Con: task types are limited. The tables may not be big enough.
+  # Response styles may be inconsistent.
+  # TODO: need to ablate.
+  # ai2-adapt-dev/Table-GPT-All-train: 3000
+  # ------------------------------------------------------
+  # Coconot dataset, generated by gpt4, for non-compliance
+  # Total: 11477
+  # Pro: a special dataset for a comprehensive list of non-compliance behaviors of models.
+  # Con: the generated queries may only reflect simple cases.
+  # TODO: need to ablate.
+  # ai2-adapt-dev/coconot-sft-reformat: 11477
+  # ------------------------------------------------------
+  # NuminaMath-TIR, extracted and generated by gpt4, for tool-integrated reasoning for math
+  # Total: 72441
+  # Pro: generally high-quality dataset with prompts mined from a web corpus, and verified tool-integrated reasoning trajectories.
+  # Con: mainly for solving math in a specific format, not in a format consistent with general chat.
+  # TODO: need to ablate. need to rewrite!!!
+  AI-MO/NuminaMath-TIR: 72441
+  # AI-MO/NuminaMath-CoT: 859000
+  # ------------------------------------------------------
+  # Xlam function calling dataset, synthesized and verified, for tool use
+  # Total: 60000
+  # Pro: a special dataset for enhancing function calling capability, good performance on BFCL
+  # Con: responses only contain the function calls and arguments, not in a style consistent with general chat.
+  # TODO: need to ablate. need to rewrite!!!
+  # Salesforce/xlam-function-calling-60k: 60000
+  # ------------------------------------------------------
+  # Lmsys chatbot arena data, human queries for challenging models, for general chat.
+  # Total: 1000000
+  # Pro: real human interaction with models, with reasonable challenges.
+  # Con: may not reflect the real challenges in actual use of AI models. The interactions include those with weak models.
+  # TODO: need to ablate. need to regenerate (the last step)!! the intermediate low-quality responses need to be downweighted.
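  # ------------------------------------------------------
  # Note on how the mixer values in this file are read (an assumption inferred from the
  # per-dataset annotations here and from the v3.9 configs earlier in this diff, not something
  # the diff states explicitly): an integer entry appears to be an absolute number of examples
  # to subsample, while 1.0 appears to mean "use the full dataset". As a worked example,
  # WizardLMTeam/WizardLM_evol_instruct_V2_196k: 30000 would keep 30000 of its 196000 examples
  # (roughly 15%), whereas HuggingFaceH4/no_robots: 9500 keeps all 9500.
  # ------------------------------------------------------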
+ # lmsys/lmsys-chat-1m: 1000000 +max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes +learning_rate: 5.0e-06 # best LR so far +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ \ No newline at end of file diff --git a/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml index 44ab2b8ce..140ed2bb6 100644 --- a/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml +++ b/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml @@ -55,4 +55,4 @@ logging_steps: 1 dataset_mix_dir: /output/ checkpointing_steps: 1000 keep_last_n_checkpoints: 1 -dataset_mix_dir: /output/ \ No newline at end of file +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml new file mode 100644 index 000000000..22a574f4f --- /dev/null +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml @@ -0,0 +1,70 @@ +model_name_or_path: meta-llama/Llama-3.1-8B +model_revision: main +use_flash_attn: true +tokenizer_name: meta-llama/Llama-3.1-8B +use_slow_tokenizer: true +dataset_mixer: + # Static v3.8 nc mix file + # /oe-adapt-default/jacobm/tulu-3-dev/data/tulu_v3.8_preview_nc.jsonl: 1.0 + allenai/tulu-v.3.8-mix-preview-noncommercial: 1.0 + + + # # # General datasets: + # ai2-adapt-dev/oasst1_converted: 7132 # all + # ai2-adapt-dev/flan_v2_converted: 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 240 # all + # ai2-adapt-dev/no_robots_converted: 9500 # all + # ai2-adapt-dev/wildchat_gpt4_converted: 100000 + + # # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 149960 # all + # ai2-adapt-dev/personahub_grade_math_v1_49980: 49980 # all + # ai2-adapt-dev/open_math_2_gsm8k_converted: 50000 + # AI-MO/NuminaMath-TIR: 72441 # all + # ai2-adapt-dev/personahub_math_interm_algebra_50000: 20000 + + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 107276 # all + + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 29980 # all + + # # Safety datasets: + # ai2-adapt-dev/coconot_converted: 10983 # all + # ai2-adapt-dev/processed-wildjailbreak: 50000 + # ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 50000 + + # # Specialty datasets: + # ai2-adapt-dev/sciriff_converted: 10000 + # ai2-adapt-dev/table_gpt_converted: 5000 + # ai2-adapt-dev/aya_dataset_converted: 100000 + + # # need to split for preferences: + # ai2-adapt-dev/wildchat_gpt4_converted: 100000 + # # ai2-adapt-dev/tulu_v3.8_unused_wildchat_prompts + # # ai2-adapt-dev/tulu_v3.8_unused_wildchat_conversations + # ai2-adapt-dev/open_math_2_gsm8k_converted: 50000 + # ai2-adapt-dev/personahub_math_interm_algebra_50000: 20000 + # ai2-adapt-dev/processed-wildjailbreak: 50000 + # ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 50000 + # ai2-adapt-dev/sciriff_converted: 10000 + # ai2-adapt-dev/table_gpt_converted: 5000 + # ai2-adapt-dev/aya_dataset_converted: 100000 + +max_seq_length: 4096 # 
need to increase to 8k +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 2 # effective batch size 128 with 1 node +learning_rate: 5.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial.yaml index 834dc99ff..fd1f81ee2 100644 --- a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial.yaml +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial.yaml @@ -41,7 +41,7 @@ dataset_mixer: max_seq_length: 4096 # need to increase to 8k preprocessing_num_workers: 128 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 4 # effective batch size 128 with 4 nodes +gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes learning_rate: 5.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial-augusta.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial-augusta.yaml new file mode 100644 index 000000000..ebb18fee4 --- /dev/null +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial-augusta.yaml @@ -0,0 +1,59 @@ +model_name_or_path: /model +model_revision: main +use_flash_attn: true +tokenizer_name: /model +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 nc mix file + # WIP + + # Static v3.9 huggingface dataset + # allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # General datasets: + ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # Math datasets: + ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # Coding datasets: + ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # IF datasets: + ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # Safety datasets: + ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + + # Specialty datasets: + ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + +max_seq_length: 4096 # need to increase to 8k +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 2 # effective batch size 128 with 4 nodes +learning_rate: 5.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git 
a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml index 93dc19d98..84a523f4e 100644 --- a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml @@ -5,43 +5,43 @@ tokenizer_name: meta-llama/Llama-3.1-8B use_slow_tokenizer: true dataset_mixer: # Static v3.9 huggingface dataset - allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + # allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 - # # General datasets: - # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all - # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all - # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all - # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all - # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + # General datasets: + ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 - # # Math datasets: - # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all - # allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all - # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 - # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 - # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + # Math datasets: + ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 - # # Coding datasets: - # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all - # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + # Coding datasets: + ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all - # # IF datasets: - # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + # IF datasets: + ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all - # # Safety datasets: - # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all - # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 - # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + # Safety datasets: + ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 - # # Specialty datasets: - # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 - # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 - # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + # Specialty datasets: + ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 max_seq_length: 4096 preprocessing_num_workers: 128 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 2 # effective batch size 128 with 8 nodes +gradient_accumulation_steps: 8 # effective batch size 128 with 8 nodes learning_rate: 5.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 @@ -54,4 +54,4 @@ report_to: logging_steps: 1 checkpointing_steps: 1000 keep_last_n_checkpoints: 1 -dataset_mix_dir: /output/ \ No newline at 
end of file +dataset_mix_dir: /output/ diff --git a/downsampling.pdf b/downsampling.pdf new file mode 100644 index 000000000..8002b7a48 Binary files /dev/null and b/downsampling.pdf differ diff --git a/downsampling_bars.pdf b/downsampling_bars.pdf new file mode 100644 index 000000000..972ef4112 Binary files /dev/null and b/downsampling_bars.pdf differ diff --git a/oe-eval-internal b/oe-eval-internal new file mode 160000 index 000000000..4c104ac6b --- /dev/null +++ b/oe-eval-internal @@ -0,0 +1 @@ +Subproject commit 4c104ac6b4fd05d1d0f83d3d2e6a46eb77efc592 diff --git a/scripts/eval/oe-eval.sh b/scripts/eval/oe-eval.sh index f66d250eb..5f74d76c8 100755 --- a/scripts/eval/oe-eval.sh +++ b/scripts/eval/oe-eval.sh @@ -151,5 +151,5 @@ for TASK in "${TASKS[@]}"; do GPU_COUNT=$GPU_COUNT fi - python oe-eval-internal/oe_eval/launch.py --model "$MODEL_NAME" --beaker-workspace "ai2/tulu-3-results" --beaker-budget ai2/oe-adapt --task "$TASK" $MODEL_TYPE --batch-size "$BATCH_SIZE" --model-args "{\"model_path\":\"${MODEL_LOCATION}\", \"max_length\": ${MAX_LENGTH}}" ${HF_UPLOAD_ARG} --gpus "$GPU_COUNT" --gantry-args '{"env-secret": "OPENAI_API_KEY=openai_api_key"}' ${REVISION_ARG} --beaker-retries 2 --beaker-priority "$PRIORITY" + python oe-eval-internal/oe_eval/launch.py --model "$MODEL_NAME" --beaker-workspace "ai2/tulu-3-dev" --beaker-budget ai2/oe-adapt --task "$TASK" $MODEL_TYPE --batch-size "$BATCH_SIZE" --model-args "{\"model_path\":\"${MODEL_LOCATION}\", \"max_length\": ${MAX_LENGTH}}" ${HF_UPLOAD_ARG} --gpus "$GPU_COUNT" --gantry-args '{"env-secret": "OPENAI_API_KEY=jacobm_OPENAI_API_KEY", "weka": "oe-adapt-default:/oe-adapt-default"}' ${REVISION_ARG} --beaker-retries 2 --beaker-priority "urgent" --gantry-secret-hf-write "jacobm_HF_TOKEN" done diff --git a/scripts/filter-v3.8-data.py b/scripts/filter-v3.8-data.py new file mode 100644 index 000000000..0d81a3881 --- /dev/null +++ b/scripts/filter-v3.8-data.py @@ -0,0 +1,111 @@ +from datasets import load_dataset + +full_ds = load_dataset("allenai/tulu-v.3.8-mix-preview-noncommercial") + +conversations = set() +prompts = set() + +for elem in full_ds["train"]: + conv = "" + prompt = elem["messages"][0]["content"] + prompts.add(prompt) + for msg in elem["messages"]: + conv += msg["content"] + conversations.add(conv) + + ### Not using anymore: + # ai2-adapt-dev/wildchat_gpt4_converted: 100000 + # # ai2-adapt-dev/tulu_v3.8_unused_wildchat_prompts + # # ai2-adapt-dev/tulu_v3.8_unused_wildchat_conversations + +seed = 42 + +### splitting: + +# wildchat_gpt4_converted_safety_decontaminated: 100000 +wildchat_ds = load_dataset("ai2-adapt-dev/wildchat_gpt4_converted_safety_decontaminated").shuffle(seed) +wildchat_ds_to_use = wildchat_ds["train"].select(range(100000)) +wildchat_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_wildchat_100k") +wildchat_ds_to_not_use = wildchat_ds["train"].select(range(100000, len(wildchat_ds["train"]))) +wildchat_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_wildchat_unused") + +del wildchat_ds +del wildchat_ds_to_use +del wildchat_ds_to_not_use + +# ai2-adapt-dev/open_math_2_gsm8k_converted: 50000 +openmath2_gsm8k_ds = load_dataset("ai2-adapt-dev/open_math_2_gsm8k_converted").shuffle(seed) +openmath2_gsm8k_to_use = openmath2_gsm8k_ds["train"].select(range(50000)) +openmath2_gsm8k_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k") +openmath2_gsm8k_to_not_use = openmath2_gsm8k_ds["train"].select(range(50000, len(openmath2_gsm8k_ds["train"]))) 
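# Illustrative sketch (not part of the original script; the helper name is hypothetical and it
# is never called here): each dataset split in this script follows the same recipe of shuffling
# with a fixed seed, pushing the first N examples as the downsampled split, and pushing the
# remainder as an "_unused" split.
def split_and_push(repo_id, n, used_repo, unused_repo, seed=42):
    # load the dataset, shuffle deterministically, and take the train split
    train = load_dataset(repo_id).shuffle(seed)["train"]
    # the first n shuffled examples become the downsampled split
    train.select(range(n)).push_to_hub(used_repo)
    # everything after the first n examples is kept as the unused split
    train.select(range(n, len(train))).push_to_hub(unused_repo)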
+openmath2_gsm8k_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_unused") + +del openmath2_gsm8k_ds +del openmath2_gsm8k_to_use +del openmath2_gsm8k_to_not_use + +# ai2-adapt-dev/personahub_math_interm_algebra_50000: 20000 +p_math_alg_ds = load_dataset("ai2-adapt-dev/personahub_math_interm_algebra_50000").shuffle(seed) +p_math_alg_ds_to_use = p_math_alg_ds["train"].select(range(20000)) +p_math_alg_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k") +p_math_alg_ds_to_not_use = p_math_alg_ds["train"].select(range(20000, len(p_math_alg_ds["train"]))) +p_math_alg_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_unused") + +del p_math_alg_ds +del p_math_alg_ds_to_use +del p_math_alg_ds_to_not_use + +# ai2-adapt-dev/processed_wildjailbreak_safety_decontaminated: 50000 +wjb_ds = load_dataset("ai2-adapt-dev/processed_wildjailbreak_safety_decontaminated").shuffle(seed) +wjb_ds_to_use = wjb_ds["train"].select(range(50000)) +wjb_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k") +wjb_ds_to_not_use = wjb_ds["train"].select(range(50000, len(wjb_ds["train"]))) +wjb_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_unused") + +del wjb_ds +del wjb_ds_to_use +del wjb_ds_to_not_use + +# ai2-adapt-dev/synthetic_finalresp_wildguardmixtrain_safety_decontaminated: 50000 +wg_ds = load_dataset("ai2-adapt-dev/synthetic_finalresp_wildguardmixtrain_safety_decontaminated").shuffle(seed) +wg_ds_to_use = wg_ds["train"].select(range(50000)) +wg_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k") +wg_ds_to_not_use = wg_ds["train"].select(range(50000, len(wg_ds["train"]))) +wg_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_unused") + +del wg_ds +del wg_ds_to_use +del wg_ds_to_not_use + +# ai2-adapt-dev/sciriff_converted: 10000 +sciriff_ds = load_dataset("ai2-adapt-dev/sciriff_converted").shuffle(seed) +sciriff_ds_to_use = sciriff_ds["train"].select(range(10000)) +sciriff_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_sciriff_10k") +sciriff_ds_to_not_use = sciriff_ds["train"].select(range(10000, len(sciriff_ds["train"]))) +sciriff_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_sciriff_unused") + +del sciriff_ds +del sciriff_ds_to_use +del sciriff_ds_to_not_use + +# ai2-adapt-dev/table_gpt_converted: 5000 +table_gpt_ds = load_dataset("ai2-adapt-dev/table_gpt_converted").shuffle(seed) +table_gpt_ds_to_use = table_gpt_ds["train"].select(range(5000)) +table_gpt_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_table_gpt_5k") +table_gpt_ds_to_not_use = table_gpt_ds["train"].select(range(5000, len(table_gpt_ds["train"]))) +table_gpt_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_table_gpt_unused") + +del table_gpt_ds +del table_gpt_ds_to_use +del table_gpt_ds_to_not_use + +# ai2-adapt-dev/aya_dataset_converted: 100000 +aya_ds = load_dataset("ai2-adapt-dev/aya_dataset_converted").shuffle(seed) +aya_ds_to_use = aya_ds["train"].select(range(100000)) +aya_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_aya_100k") +aya_ds_to_not_use = aya_ds["train"].select(range(100000, len(aya_ds["train"]))) +aya_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_aya_unused") + +del aya_ds +del aya_ds_to_use +del aya_ds_to_not_use diff --git a/scripts/plot-downsampling.py b/scripts/plot-downsampling.py new file mode 100644 index 000000000..bc7b04b65 --- /dev/null +++ 
b/scripts/plot-downsampling.py @@ -0,0 +1,309 @@ +benchmark_data = { + "Avg.": { + "eval_setting": "", + "sft_5": 57.69, + "sft_10": 58.06, + "sft_25": 58.64, + "sft_50": 59.18, + "sft_75": 59.57, + "sft_full": 60.08 + }, + "MMLU": { + "eval_setting": "5 shot", + "sft_5": 64.1, + "sft_10": 63.9, + "sft_25": 63.4, + "sft_50": 62.3, + "sft_75": 62.1, + "sft_full": 62.1 + }, + "TruthfulQA": { + "eval_setting": "6 shot", + "sft_5": 51.0, + "sft_10": 50.4, + "sft_25": 49.9, + "sft_50": 48.9, + "sft_75": 46.4, + "sft_full": 46.8 + }, + "PopQA": { + "eval_setting": "15 shot", + "sft_5": 30.8, + "sft_10": 30.8, + "sft_25": 29.8, + "sft_50": 30.1, + "sft_75": 29.6, + "sft_full": 29.3 + }, + # TODO: BBH IS NOT UP TO DATE!!! + "BigBenchHard": { + "eval_setting": "3 shot, CoT", + "sft_5": 67.5, + "sft_10": 68.2, + "sft_25": 68.5, + "sft_50": 67.6, + "sft_75": 69.7, + "sft_full": 68.8 + }, + "HumanEval": { + "eval_setting": "pass@10", + "sft_5": 81.5, + "sft_10": 81.5, + "sft_25": 81.4, + "sft_50": 84.4, + "sft_75": 86.7, + "sft_full": 86.2 + }, + "HumanEval+": { + "eval_setting": "pass@10", + "sft_5": 76.1, + "sft_10": 77.4, + "sft_25": 75.5, + "sft_50": 78.3, + "sft_75": 79.5, + "sft_full": 81.4 + }, + "GSM8K": { + "eval_setting": "8 shot, CoT", + "sft_5": 66.0, + "sft_10": 66.3, + "sft_25": 72.1, + "sft_50": 73.8, + "sft_75": 74.4, + "sft_full": 76.2 + }, + "DROP": { + "eval_setting": "3 shot", + "sft_5": 60.7, + "sft_10": 60.7, + "sft_25": 59.4, + "sft_50": 60.7, + "sft_75": 59.9, + "sft_full": 61.3 + }, + "MATH": { + "eval_setting": "4 shot CoT, Flex", + "sft_5": 29.3, + "sft_10": 28.7, + "sft_25": 30.0, + "sft_50": 30.9, + "sft_75": 31.7, + "sft_full": 31.5 + }, + "IFEval": { + "eval_setting": "Strict", + "sft_5": 65.4, + "sft_10": 68.6, + "sft_25": 70.6, + "sft_50": 68.2, + "sft_75": 70.6, + "sft_full": 72.8 + }, + "AlpacaEval 2": { + "eval_setting": "LC % win", + "sft_5": 11.1, + "sft_10": 10.2, + "sft_25": 11.7, + "sft_50": 13.3, + "sft_75": 12.4, + "sft_full": 12.4 + }, + "Safety": { + "eval_setting": "", + "sft_5": 89.8, + "sft_10": 90.9, + "sft_25": 92.3, + "sft_50": 92.6, + "sft_75": 92.8, + "sft_full": 93.1 + } +} + +import matplotlib.pyplot as plt +import numpy as np + +# Create x-axis values (SFT percentages) +x_values = [5, 10, 25, 50, 75, 100] # 100 represents full SFT + +# # Create figure and axis with a larger size +# plt.figure(figsize=(12, 8)) + +# # Color palette for different lines +# colors = plt.cm.tab20(np.linspace(0, 1, len(benchmark_data))) + +# # Plot each benchmark +# for (benchmark, data), color in zip(benchmark_data.items(), colors): +# if benchmark != "Avg.": # Skip the average for now +# y_values = [ +# data["sft_5"], +# data["sft_10"], +# data["sft_25"], +# data["sft_50"], +# data["sft_75"], +# data["sft_full"] +# ] +# plt.plot(x_values, y_values, marker='o', label=benchmark, color=color, linewidth=2) + +# # Add the average line with higher emphasis +# avg_values = [ +# benchmark_data["Avg."]["sft_5"], +# benchmark_data["Avg."]["sft_10"], +# benchmark_data["Avg."]["sft_25"], +# benchmark_data["Avg."]["sft_50"], +# benchmark_data["Avg."]["sft_75"], +# benchmark_data["Avg."]["sft_full"] +# ] +# plt.plot(x_values, avg_values, 'k--', label='Average', linewidth=3, marker='s') + +# # Customize the plot +# plt.xlabel('SFT Training Data Size', fontsize=12) +# plt.ylabel('Performance', fontsize=12) +# plt.title('Benchmark Performance Across Different SFT Percentages', fontsize=14) +# plt.grid(True, linestyle='--', alpha=0.7) +# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper 
left', fontsize=10) + +# # Set x-axis ticks +# plt.xticks(x_values) + +# # Adjust layout to prevent label cutoff +# plt.tight_layout() + +# # Show the plot +# plt.show() + +# Optional: Create a second plot focusing on specific benchmarks of interest +# plt.figure(figsize=(20, 8)) + +# Define benchmarks and SFT percentages +benchmarks = [ + 'Avg.', + 'GSM8K', + 'HumanEval+', + 'Safety', + 'TruthfulQA', +] +sft_percentages = ['5%', '10%', '25%', '50%', '75%', '100%'] +# colors = ['#0A2B35', '#0fcb8c', '#105257', '#f0529c', '#838383', '#0a3235'] # One color for each percentage +colors = [ + '#FAC4DD', # 10% + '#F8ADD0', # 20% + '#F697C3', # 40% + '#F480B6', # 60% + '#F269A9', # 80% + '#F0529C', # 100% - original pink +] + +colors = [ + "#E7EEEE", # RGB(231, 238, 238) + "#CEDCDD", # RGB(206, 220, 221) + "#B7CBCC", # RGB(183, 203, 204) + "#9FB9BB", # RGB(159, 185, 187) + "#88A8AB", # RGB(136, 168, 171) + "#F0529C", # PINK + "#6E979A", # RGB(110, 151, 154) + "#588689", # RGB(88, 134, 137) + "#3F7478", # RGB(63, 116, 120) + "#105257", # RGB(16, 82, 87) + "#0A3235", # RGB(10, 50, 53) +] + +# Set up the plot +fig, ax = plt.subplots(figsize=(20, 8)) + +# Width of bars and positions +width = 0.12 +n_percentages = len(sft_percentages) + +# Create bars for each benchmark +for i, benchmark in enumerate(benchmarks): + data = benchmark_data[benchmark] + values = [ + data["sft_5"], + data["sft_10"], + data["sft_25"], + data["sft_50"], + data["sft_75"], + data["sft_full"] + ] + + # Calculate positions for this benchmark's group of bars + x = i + for j in range(n_percentages): + bar_position = x - (n_percentages-1)*width/2 + j*width + bar = ax.bar(bar_position, values[j], width, + label=sft_percentages[j] if i == 0 else "", + color=colors[j], + edgecolor="black") + + # Add value labels on top of bars + # ax.text(bar_position, values[j], f'{values[j]:.1f}', ha='center', va='bottom', fontsize=8) + +# Customize the plot +# ax.set_xlabel('Benchmarks', fontsize=14) +ax.set_ylabel('Performance', fontsize=18) +plt.tick_params(axis='y', labelsize=18) +# ax.set_title('Performance by Benchmark and SFT Percentage', fontsize=14) + +# Set x-axis ticks and labels +ax.set_xticks(range(len(benchmarks))) +ax.set_xticklabels(benchmarks, ha="center", fontsize=18) + +ax.spines[["right", "top"]].set_visible(False) + +# Add legend +# ax.legend(title='SFT Sample Size', loc='center', bbox_to_anchor=(0.885, 0.8)) + +# Add grid +# ax.grid(True, linestyle='--', alpha=0.3, axis='y') + +# Adjust layout to accommodate legend +# plt.subplots_adjust(right=0.85) + +# Save and show the plot +plt.savefig('downsampling_bars.pdf', bbox_inches='tight', dpi=300) +plt.show() + +# # Define specific benchmarks and their colors +# plot_config = { +# 'Avg.': '#0a3235', # Black for average +# 'TruthfulQA': '#b11bE8', # Coral red +# 'HumanEval+': '#f0529c', # Turquoise +# 'Safety': '#105257', # Light blue +# 'GSM8K': '#0fcb8c' # Sage green +# } + +# # Plot each benchmark with its specified color +# for benchmark, color in plot_config.items(): +# data = benchmark_data[benchmark] +# y_values = [ +# data["sft_5"], +# data["sft_10"], +# data["sft_25"], +# data["sft_50"], +# data["sft_75"], +# data["sft_full"] +# ] +# # Make average line dashed and thicker +# if benchmark == 'Avg.': +# plt.plot(x_values, y_values, '--', marker='s', label=benchmark, +# color=color, linewidth=3) +# else: +# plt.plot(x_values, y_values, marker='o', label=benchmark, +# color=color, linewidth=2) + +# # Customize the focused plot +# plt.xlabel('SFT Percentage', fontsize=12) +# 
plt.ylabel('Performance', fontsize=12) +# # plt.title('Selected Benchmark Performance Trends', fontsize=14) +# plt.grid(True, linestyle='--', alpha=0.7) +# plt.legend(fontsize=10) +# plt.xticks(x_values) + +# # Adjust layout +# plt.tight_layout() + +# # Show the plot +# # plt.show() + +# plt.savefig('downsampling.pdf', bbox_inches='tight', dpi=300) +# plt.close() \ No newline at end of file diff --git a/scripts/plot-versions-sft.py b/scripts/plot-versions-sft.py new file mode 100644 index 000000000..7947a1946 --- /dev/null +++ b/scripts/plot-versions-sft.py @@ -0,0 +1,177 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +# Create dictionary with all models +data = { + # "L3.1-8B-v3.9-nc-fixed-2-pif_uf_hs_dpo___model__42__1730613882": { + # "Rank": 1, "Average": 62.42, "alpaca_eval": 28.6, "BBH": 68.9, + # "codex_humaneval": 85.1, "codex_humanevalplus": 81.4, "drop": 61.3, + # "GSM8K": 82.3, "IFEval": 78.4, "MATH": 41.2, + # "mmlu:cot::summarize": float('nan'), "MMLU": 63.1, + # "Safety": 76.5, "popqa": 29.1, "truthfulqa": 54.9 + # }, + # "fae_dpo_on_L3.1-8B-v3.9-nc-fixed-2_add_shp___model__42__1730847906": { + # "Rank": 2, "Average": 61.01, "alpaca_eval": 27.1, "BBH": 65.0, + # "codex_humaneval": 83.9, "codex_humanevalplus": 78.2, "drop": 58.0, + # "GSM8K": 82.7, "IFEval": 78.6, "MATH": 41.9, + # "mmlu:cot::summarize": float('nan'), "MMLU": 64.8, + # "Safety": 76.1, "popqa": 29.1, "truthfulqa": 48.5 + # }, + "Tülu v3.7": { + "Rank": 3, "Average": 60.48, "alpaca_eval": 13.7, "BBH": 67.8, + "codex_humaneval": 87.2, "codex_humanevalplus": 83.6, "drop": 60.6, + "GSM8K": 75.1, "IFEval": 72.5, "MATH": 32.6, + "mmlu:cot::summarize": 65.1, "MMLU": 63.8, + "Safety": 94.7, "popqa": 29.4, "truthfulqa": 44.7 + }, + "Tülu v3.8": { + "Rank": 4, "Average": 60.12, "alpaca_eval": 12.0, "BBH": 67.9, + "codex_humaneval": 85.8, "codex_humanevalplus": 81.1, "drop": 60.4, + "GSM8K": 77.2, "IFEval": 72.1, "MATH": 32.5, + "mmlu:cot::summarize": 65.3, "MMLU": 63.2, + "Safety": 93.5, "popqa": 29.3, "truthfulqa": 46.5 + }, + "Tülu v3.9": { + "Rank": 5, "Average": 60.08, "alpaca_eval": 12.4, "BBH": 67.9, + "codex_humaneval": 86.2, "codex_humanevalplus": 81.4, "drop": 61.3, + "GSM8K": 76.2, "IFEval": 72.8, "MATH": 31.5, + "mmlu:cot::summarize": float('nan'), "MMLU": 62.1, + "Safety": 93.1, "popqa": 29.3, "truthfulqa": 46.8 + }, + "Tülu v3.4": { + "Rank": 6, "Average": 56.79, "alpaca_eval": 11.4, "BBH": 65.3, + "codex_humaneval": 86.2, "codex_humanevalplus": 78.3, "drop": 55.8, + "GSM8K": 76.3, "IFEval": 52.9, "MATH": 25.5, + "mmlu:cot::summarize": 62.0, "MMLU": 64.8, + "Safety": 89.6, "popqa": 23.5, "truthfulqa": 51.9 + }, + "Tülu v3.1": { + "Rank": 7, "Average": 55.46, "alpaca_eval": 10.5, "BBH": 64.6, + "codex_humaneval": 83.8, "codex_humanevalplus": 80.8, "drop": 64.7, + "GSM8K": 74.5, "IFEval": 52.5, "MATH": 19.5, + "mmlu:cot::summarize": 63.7, "MMLU": 64.6, + "Safety": 70.3, "popqa": 31.4, "truthfulqa": 48.3 + }, + "Tülu v3.0": { + "Rank": 8, "Average": 55.18, "alpaca_eval": 11.3, "BBH": 63.3, + "codex_humaneval": 85.4, "codex_humanevalplus": 81.2, "drop": 62.5, + "GSM8K": 72.9, "IFEval": 48.8, "MATH": 24.2, + "mmlu:cot::summarize": 62.8, "MMLU": 65.1, + "Safety": 68.0, "popqa": 31.2, "truthfulqa": 48.2 + }, + # "Tülu v3.2": { + # "Rank": 9, "Average": 55.05, "alpaca_eval": 12.1, "BBH": 66.5, + # "codex_humaneval": 84.2, "codex_humanevalplus": 79.7, "drop": 63.1, + # "GSM8K": 73.1, "IFEval": 49.7, "MATH": 19.0, + # "mmlu:cot::summarize": 63.7, "MMLU": 64.1, + # "Safety": 68.9, 
"popqa": 31.6, "truthfulqa": 49.2 + # }, + # "hf-llama-3-tulu-2-dpo-8b": { + # "Rank": 10, "Average": 49.49, "alpaca_eval": 14.1, "BBH": 57.3, + # "codex_humaneval": 69.2, "codex_humanevalplus": 67.7, "drop": 58.3, + # "GSM8K": 63.6, "IFEval": 48.8, "MATH": 13.5, + # "mmlu:cot::summarize": float('nan'), "MMLU": 61.8, + # "Safety": 57.9, "popqa": 24.6, "truthfulqa": 59.8 + # }, + "Tülu v2.0": { + "Rank": 11, "Average": 48.30, "alpaca_eval": 8.9, "BBH": 57.1, + "codex_humaneval": 66.9, "codex_humanevalplus": 63.1, "drop": 61.7, + "GSM8K": 60.4, "IFEval": 42.3, "MATH": 14.0, + "mmlu:cot::summarize": float('nan'), "MMLU": 61.8, + "Safety": 70.7, "popqa": 23.3, "truthfulqa": 49.4 + } +} + +# Replace this dictionary with your preferred hex colors for each model +colors = { + "Tülu v2.0": "#F7C8E2", + "Tülu v3.0": "#E7EEEE", # RGB(231, 238, 238) + "Tülu v3.1": "#CEDCDD", # RGB(206, 220, 221) + "Tülu v3.4": "#9FB9BB", + "Tülu v3.7": "#88A8AB", + # "Tülu v3.2": "#000080", + "Tülu v3.8": "#6E979A", + # "Tülu v3.7": "#588689", + # "Tülu v3.8": "#3F7478", + "Tülu v3.9": "#F0529C", + "fae_dpo_on_L3.1-8B-v3.9-nc-fixed-2_add_shp___model__42__1730847906": "#00FF00", + "L3.1-8B-v3.9-nc-fixed-2-pif_uf_hs_dpo___model__42__1730613882": "#FF0000", + "hf-llama-3-tulu-2-dpo-8b": "#808000", +} + + # "#B7CBCC", # RGB(183, 203, 204) + # "#9FB9BB", # RGB(159, 185, 187) + # "#88A8AB", # RGB(136, 168, 171) + # "#6E979A", # RGB(110, 151, 154) + # "#588689", # RGB(88, 134, 137) + # "#3F7478", # RGB( + # ) + # "#105257", # RGB(16, 82, 87) + # "#0A3235", # RGB(10, 50, 53) + # "#F0529C", # PINK + +# Convert dictionary to DataFrame +df = pd.DataFrame.from_dict(data, orient='index') + +# Get metrics (excluding Rank and Average) +# metrics = [col for col in df.columns if col not in ['Rank']] + +metrics = [ + "Average", + "BBH", + "GSM8K", + "IFEval", + "MATH", + "MMLU", + "Safety", +] + +# Set up the plot +fig, ax = plt.subplots(figsize=(15, 8)) + +# Create the grouped bar chart +# plt.figure(figsize=(20, 10)) + +# Set the width of each bar and positions of the bars +width = 0.08 # Reduced width to accommodate more bars +x = np.arange(len(metrics)) + +# Create bars for each model +for i, (model, model_data) in enumerate(sorted(df.iterrows())): + plt.bar(x + i*width, + model_data[metrics], + width, + label=model.split('___')[0] if '___' in model else model, + color=colors[model], + edgecolor="black") + +# Customize the plot +# plt.xlabel('Metrics', fontsize=12) +# plt.ylabel('Score', fontsize=12) +# plt.title('Model Performance Comparison Across Different Metrics', fontsize=14) + +# Customize the plot +# ax.set_xlabel('Benchmarks', fontsize=14) +ax.set_ylabel('Performance', fontsize=18) +plt.tick_params(axis='y', labelsize=18) +# ax.set_title('Performance by Benchmark and SFT Percentage', fontsize=14) + +# Set x-axis ticks and labels +ax.set_xticks(range(len(metrics))) +ax.set_xticklabels(metrics, ha="center", fontsize=18) + +ax.spines[["right", "top"]].set_visible(False) + +# Add legend + +plt.xticks(x + width * len(df)/2, metrics, ha='center') +# plt.legend(bbox_to_anchor=(0.6, 0.75), loc='upper left') +# plt.grid(True, alpha=0.3) + +# Adjust layout to prevent label cutoff +plt.tight_layout() + +# Save and show the plot +plt.savefig('tulu_version_bars.pdf', bbox_inches='tight', dpi=300) +plt.show() \ No newline at end of file diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index a0f06da44..f55f4a1c3 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -76,11 +76,11 @@ def 
adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): parser.add_argument("--beaker_image", type=str, default="nathanl/open_instruct_auto", help="If given, use this Beaker image.") parser.add_argument("--beaker_subfolder", type=str, default=None) parser.add_argument("--cluster", nargs='+', default=[ - "ai2/allennlp-cirrascale", - "ai2/general-cirrascale", - "ai2/s2-cirrascale-l40", + # "ai2/allennlp-cirrascale", + # "ai2/general-cirrascale", + # "ai2/s2-cirrascale-l40", "ai2/allennlp-elara-cirrascale", - "ai2/pluto-cirrascale", + # "ai2/pluto-cirrascale", "ai2/neptune-cirrascale", "ai2/saturn-cirrascale", "ai2/jupiter-cirrascale-2", @@ -470,9 +470,11 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", f"--model_name_or_path {model_info[1]} --hf_revision {args.hf_revision}")] task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", f"--tokenizer_name_or_path {model_info[1]}")] elif model_info[1].startswith("/"): # if it's a local model, load it from the local directory - assert nfs_available, "NFS is required for path-based models." # to be safe. + # assert nfs_available, "NFS is required for path-based models." # to be safe. task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", f"--model_name_or_path {model_info[1]}")] task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", f"--tokenizer_name_or_path {model_info[1]}")] + elif model_info[1].startswith("weka"): + task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1].split(':/')[1])] else: # if it's a beaker model, mount the beaker dataset to `/model` task_spec['datasets'][1]['source']['beaker'] = model_info[1] @@ -590,6 +592,8 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): ## model location munging: if beaker, use beaker://. If hf, just name if model_info[0].startswith("hf-"): oe_eval_cmd += f" --model-location {model_info[1]}" + elif "weka" in model_info[1]: + oe_eval_cmd += f" --model-location {model_info[1]}" else: oe_eval_cmd += f" --model-location beaker://{model_info[1]}" if args.hf_revision: @@ -639,6 +643,8 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): assert nfs_available, "NFS is required for path-based models." # to be safe. task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1])] task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", "--tokenizer_name_or_path "+model_info[1])] + elif model_info[1].startswith("weka"): + task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1].split(':/')[1])] else: # if it's a beaker model, mount the beaker dataset to `/model` task_spec['datasets'][1]['source']['beaker'] = model_info[1] @@ -651,6 +657,7 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): # add gpu information. 
# we just assume you want to use all the gpus for one task at a time + task_spec['resources']['gpuCount'] = 8 num_gpus = task_spec['resources']['gpuCount'] task_spec["arguments"][0]+= f" --min_gpus_per_task {num_gpus}" diff --git a/scripts/submit_finetune_job.py b/scripts/submit_finetune_job.py index 7b7e7609f..9ab0330de 100644 --- a/scripts/submit_finetune_job.py +++ b/scripts/submit_finetune_job.py @@ -166,7 +166,7 @@ def parse_args(args): d['tasks'][0]['arguments'][0] = new_arguments # name and description - exp_name = f"open_instruct_finetune_{model_name}_{now}" + exp_name = f"open_instruct_finetune_{model_name}_{now}"[:128] d['description'] = exp_name d['tasks'][0]['name'] = exp_name @@ -199,6 +199,9 @@ def parse_args(args): }, ] + if "google" in args.cluster: + d["tasks"][0]["datasets"].pop(0) + # WANDB settings for env in d['tasks'][0]['envVars']: if env['name'] == "WANDB_DISABLED": diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py new file mode 100644 index 000000000..061dc03f8 --- /dev/null +++ b/scripts/submit_merge_job.py @@ -0,0 +1,124 @@ +import copy +import subprocess +import yaml +import re +import itertools +from datetime import date +import argparse +import os + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--workspace", type=str, default="tulu-3-dev") + parser.add_argument("--beaker_image", type=str, default="nathanl/open_instruct_auto", help="If given, use this Beaker image.") + parser.add_argument("--beaker_config", type=str, default="configs/beaker_configs/default_merge.yaml") + parser.add_argument("--merge_config", type=str, default="configs/merge_configs/example_linear_merge_config.yaml") + parser.add_argument("--cluster", nargs='+', default=["ai2/neptune-cirrascale", "ai2/saturn-cirrascale", "ai2/jupiter-cirrascale-2"]) + parser.add_argument("--priority", type=str, default="high") + parser.add_argument("--preemptible", action="store_true", default=True, help="for using preemtipble jobs (required on some instances)") + parser.add_argument("--output_dir", type=str, default="/output") + args = parser.parse_args() + + with open(args.merge_config, 'r') as f: + default_yaml = f.read() + mergeConfig = yaml.load(default_yaml, Loader=yaml.FullLoader) + + # TODO: support SLERP + assert mergeConfig["merge_method"] in ["linear", "task_arithmetic"], f"merging method {mergeConfig['merge_method']} not supported" + + with open(f"configs/merge_configs/base_configs/default_{mergeConfig['merge_method']}_merge.yaml", 'r') as f: + merge_yaml = f.read() + baseConfig = yaml.load(merge_yaml, Loader=yaml.FullLoader) + + baseConfig["normalize"] = mergeConfig["normalize"] + baseConfig["models"] = [] + + if mergeConfig["merge_method"] == "task_arithmetic": + baseConfig["models"].append({ + "model": mergeConfig["base_model"] + }) + baseConfig["base_model"] = mergeConfig["base_model"] + + beakerDatasets = [] + wekaBuckets = set() + for elem in mergeConfig["models"]: + # - model: /model-one + # parameters: + # weight: 1.0 + + # - name: name + # location: beaker + # path: jacobm/beaker-dataset + # weight: 0.5 + if elem["location"] == "beaker": + model_data = { + "model": f"/{elem['name']}", + "parameters": {"weight": float(elem["weight"])} + } + if mergeConfig["merge_method"] == "task_arithmetic": + model_data["parameters"]["normalize"] = mergeConfig["normalize"] + # beakerConfig['datasets'][1]['source']['beaker'] = model_info[1] + # - mountPath: /hf_llama_models + # source: + # beaker: Yizhongw03/hf_llama_model_7B + beakerDatasets.append({ + 
"mountPath": f"/{elem['name']}", + "source": {"beaker": elem["path"]} + }) + # mount datasets + elif elem["location"] in ["huggingface", "nfs"]: + model_data = { + "model": elem['path'], + "parameters": {"weight": float(elem["weight"])} + } + if mergeConfig["merge_method"] == "task_arithmetic": + model_data["parameters"]["normalize"] = mergeConfig["normalize"] + elif elem["location"] == "weka": # verify the only available cluster(s) have weka + if elem["wekaBucket"] not in wekaBuckets: + beakerDatasets.append({ + "mountPath": f"/{elem['wekaBucket']}", + "source": {"weka": elem["wekaBucket"]} + }) + wekaBuckets.add(elem["wekaBucket"]) + model_data = { + "model": elem["path"], + "parameters": {"weight": float(elem["weight"])} + } + if mergeConfig["merge_method"] == "task_arithmetic": + model_data["parameters"]["normalize"] = mergeConfig["normalize"] + else: + print(f"Unsupported location: {elem['location']}") + baseConfig["models"].append(model_data) + + with open(args.beaker_config, 'r') as f: + beaker_yaml = f.read() + beakerConfig = yaml.load(beaker_yaml, Loader=yaml.FullLoader) + + beakerConfig['tasks'][0]['image']['beaker'] = args.beaker_image + # TODO: fix these + beakerConfig['tasks'][0]['constraints']['cluster'] = args.cluster + beakerConfig['tasks'][0]['context']['priority'] = args.priority + beakerConfig['tasks'][0]['context']['preemptible'] = args.preemptible # True required for Jupiter/Pluto + + print(beakerConfig) + + if len(beakerDatasets) > 0: + beakerConfig["tasks"][0]["datasets"] = beakerDatasets + base_command = beakerConfig["tasks"][0]["arguments"][0].replace("{OUTPUT_DIR}", args.output_dir) + beakerConfig["tasks"][0]["arguments"][0] = base_command.replace("{RAW_CONFIG}", f'"{str(baseConfig)}"') + + experiment_name = f"open_instruct_merge_models" + beakerConfig["description"] = experiment_name + # if configs/beaker_configs/auto_created doesn't exist, create it with os + if not os.path.exists("configs/beaker_configs/auto_created"): + os.makedirs("configs/beaker_configs/auto_created") + fn = "configs/beaker_configs/auto_created/{}.yaml".format(experiment_name) + os.makedirs(os.path.dirname(fn), exist_ok=True) + with open(fn, "w") as file: + yaml.dump(beakerConfig, file, default_flow_style=True) + + cmd = "beaker experiment create {} --workspace ai2/{}".format(fn, args.workspace) + subprocess.Popen(cmd, shell=True) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/table-script.py b/scripts/table-script.py new file mode 100644 index 000000000..ff199b035 --- /dev/null +++ b/scripts/table-script.py @@ -0,0 +1,331 @@ +import pandas as pd +import argparse +import sys + +""" +Examples: + +python -m plots.table-tulu3-90 --csv-path leaderboard/exported_results.csv --models ppo_ray_β_0.03__3__1730357435 Meta-Llama-3.1-8B-Instruct hf-ministral_8b_instruct_2410 hf-qwen2_5_7b_instruct valpy_dpo_70b_hslj_uflj_dalj_wciflj_iftaxlj_wcunusedlj hf-Llama-3.1-70B-Instruct hf-qwen2_5_72b_instruct + +8B +python -m plots.table-tulu3-90 --csv-path leaderboard/exported_results.csv --models Meta-Llama-3.1-8B-Instruct hf-google_gemma-2-9b-it hf-NousResearch-Hermes-3-Llama-3.1-8B hf-qwen2_5_7b_instruct hf-ministral_8b_instruct_2410 L3.18B-math-mix-final-nc__meta-llama_Llama-3.1-8B__42__1729284525 dpo_tune___model__42__1729311739 ppo_ray_β_0.03__3__1730357435 + +70B +python -m plots.table-tulu3-90 --csv-path leaderboard/exported_results.csv --models hf-Meta-Llama-3.1-70B-Instruct hf-qwen2_5_72b_instruct hf-NousResearch-Hermes-3-Llama-3.1-70B 
hf-llama_3_1_nemotron_70B_instruct_hf L3.1-70B-v3.8-lr_2e-6-2_epochs 70B_ppo_ray_β_0.07_lr_1e-7__3__1730258118 L3.1-70B-v3.8-lr_2e-6-2_epochs-pif_dpo-5e-7 + +Merging example: +python table-tulu3.py --csv-path ~/Downloads/exported_results_4.csv --models L3.1-8B-v3.8-nc-soup L3.1-8B-v3.9-nc-3__meta-llama_Llama-3.1-8B__456__1730332817 L3.1-8B-v3.9-nc-2__meta-llama_Llama-3.1-8B__123__1730333671 L3.1-8B-v3.9-nc__meta-llama_Llama-3.1-8B__42__1730330678 +""" + +model_label_conversion = { + # llamas + "Meta-Llama-3.1-8B-Instruct": "Llama 3.1 8B Instruct", + "hf-Llama-3.1-70B-Instruct": "Llama 3.1 70B Instruct", + "hf-Meta-Llama-3.1-70B-Instruct": "Llama 3.1 70B Instruct", + # + "hf-llama-3-tulu-2-8b": "Tulu 2 SFT", + "hf-llama-3-tulu-2-dpo-8b": "Tulu 2 + DPO", + "L3.1-8B-v3.8-nc-final__meta-llama_Llama-3.1-8B__42__1729991287": "Tulu 3 SFT", + "L3.1-8B-v3.8-wip-persona_code_v3-2-pif_dpo___model__42__1729725103": "Tulu 3 + DPO", + "ljrmvalue_lj_gsm_data_step_300": "Tulu 3 + RL", + "hf-NousResearch-Hermes-3-Llama-3.1-8B": "Hermes 3 8B", + "hf-NousResearch-Hermes-3-Llama-3.1-70B": "Hermes 3 70B", + "hf-llama_3_tulu_2_dpo_70b": "Tulu 2 + DPO 70B", + "L3.1-70B-v3.7-nc": "Tulu 3 70B SFT", + "hf-google_gemma-2-9b-it": "Gemma 2 9B", + "hf-ministral_8b_instruct_2410": "Ministral 8B", + "hf-magpielm_8b_chat_v0_1": "Magpie 8B", + "hf-gemma_2_9b_it_simpo": "Gemma 2 9B SimPO", + "L3.1-8B-v3.8-nc-soup-pif_dpo-soup": "Tulu 3 + Merging + DPO", + "L3.1-8B-v3.8-nc-soup": "Tulu 3 SFT Merge", + "L3.1-8B-v3.9-nc-3__meta-llama_Llama-3.1-8B__456__1730332817": "Seed 1", + "L3.1-8B-v3.9-nc-2__meta-llama_Llama-3.1-8B__123__1730333671": "Seed 2", + "L3.1-8B-v3.9-nc__meta-llama_Llama-3.1-8B__42__1730330678": "Seed 3", + # random SFT mixes + "fae_llama3_sftmix_v3.4_personahub_if_v1__meta-llama_Meta-Llama-3-8B__42__1728059424": "Tulu v3.4 SFT", + "sft_preview_mix_v3.5.10__meta-llama_Llama-3.1-8B__42__1729148912": "Tulu v3.6 SFT", + "L3.18B-v3.7-c__meta-llama_Llama-3.1-8B__42__1729454073": "Tulu v3.7 SFT", + "L3.1-8B-v3.8-nc-final__meta-llama_Llama-3.1-8B__42__1729991287": "Tulu v3.8 SFT", + "L3.1-8B-v3.8-nc-soup": "Tulu v3.8 SFT + Merging", + "hf-llama_3_tulu_2_70b": "Tulu 2 SFT 70B", + "L3.1-70B-v3.8-lr_2e-6-2_epochs-pif_dpo-5e-7": "Tulu 3 DPO 70B", + "L3.1-70B-v3.8-lr_2e-6-2_epochs": "Tulu 3 SFT 70B", + # 7b rivals + "hf-qwen2_5_7b_instruct": "Qwen 2.5 7B Instruct", + "hf-ministral_8b_instruct_2410": "Ministral 8B Instruct", + "hf-google_gemma-2-9b-it": "Gemma 2 9B", + "hf-gemma_2_9b_it_simpo": "Gemma 2 9B SimPO", + # 70b rivalsqw + "hf-llama_3_1_nemotron_70b_instruct_hf": "Nemotron Llama 3.1 70B", + "hf-llama_3_1_nemotron_70B_instruct_hf": "Nemotron Llama 3.1 70B", + "hf-qwen2_5_72b_instruct": "Qwen 2.5 72B", + # LMSYS version compare + "L3.18B-math-mix-final-nc__meta-llama_Llama-3.1-8B__42__1729284525": "Tulu 3 SFT", + "dpo_tune___model__42__1729311739": "Tulu 3 DPO", + "ppo_ray_β_0.03__3__1730357435": "Tulu 3 8B", + # 70b fine tunes + "L3.1-70B-v3.8-lr_2e-6-2_epochs-pif_dpo-5e-7": "Tulu 70B DPO", + "70B_ppo_ray_β_0.07_lr_1e-7__3__1730258118": "Tulu 70B RL", + "valpy_dpo_70b_hslj_uflj_dalj_wciflj_iftaxlj_wcunusedlj": "Tulu 3 70B", + "hf-NousResearch-Hermes-3-Llama-3.1-8B": "Hermes 3 8B", + "hf-llama-3-tulu-2-8b": "Tulu 2 8B SFT", + "L3.1-8B-v3.9-nc-fixed-2__meta-llama_Llama-3.1-8B__123__1730531285": "Tulu 3 8B SFT", + "hf-NousResearch-Hermes-3-Llama-3.1-70B": "Hermes 3 70B", + "hf-llama-3-tulu-2-70b": "Tulu 2 70B SFT", + "L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-3__meta-llama_Llama-3.1-70B__456__1731059165": "Tulu 3 70B SFT", + 
"L3.1-8B-v3.9-nc-no-safety__meta-llama_Llama-3.1-8B__42__1731562927": "Tulu 3 8B SFT w/o Safety", + "L3.1-8B-v3.9-nc-no-wc__meta-llama_Llama-3.1-8B__42__1731562946": "Tulu 3 8B SFT w/o WildChat", + "L3.1-8B-v3.9-nc-no-synthetic__meta-llama_Llama-3.1-8B__42__1731613382": "Tulu 3 8B SFT w/o Synthetic Data (ours)", + "L3.1-8B-v3.9-nc-no-math__meta-llama_Llama-3.1-8B__42__1731562937": "Tulu 3 8B SFT w/o Mathematics", + "hf-RLHFlow-LLaMA3-SFT-v2": "RLHFlow SFT V2", + "hf-MAmmoTH2-8B": "MAmmoTH2 8B", + + # downsampling + "L3.1-8B-v3.9-nc-downsample-0.05__meta-llama_Llama-3.1-8B__42__1731214637": "Tulu 3 8B SFT (5\%)", + "L3.1-8B-v3.9-nc-downsample-0.10__meta-llama_Llama-3.1-8B__42__1731214619": "Tulu 3 8B SFT (10\%)", + "L3.1-8B-v3.9-nc-downsample-0.25__meta-llama_Llama-3.1-8B__42__1731214572": "Tulu 3 8B SFT (25\%)", + "L3.1-8B-v3.9-nc-downsample-0.50__meta-llama_Llama-3.1-8B__42__1731214572": "Tulu 3 8B SFT (50\%)", + "L3.1-8B-v3.9-nc-downsample-0.75__meta-llama_Llama-3.1-8B__42__1731214576": "Tulu 3 8B SFT (75\%)", +} + +# Metric keys definition +metric_keys = { + "MMLU": "mmlu:mc::tulu", + "TruthfulQA": "truthfulqa", + "PopQA": "popqa", + "BigBenchHard": "bbh:cot::tulu", + "HumanEval": "codex_humaneval", + "HumanEval+": "codex_humanevalplus", + "GSM8K": "gsm8k", + "DROP": "drop", + "MATH": "math::flex", + "IFEval": "ifeval", + "AlpacaEval 2": "alpaca_eval", + "Safety": "overall_oe_safety_average", +} + +eval_settings = { + "MMLU": "5 shot", + "TruthfulQA": "6 shot", + "PopQA": "15 shot", + "BigBenchHard": "3 shot, CoT", + "HumanEval": "pass@10", + "HumanEval+": "pass@10", + "GSM8K": "8 shot, CoT", + "DROP": "3 shot", + "MATH": "4 shot CoT, Flex", + "IFEval": "Strict", + "AlpacaEval 2": "LC \% win", + "Safety": "", +} + +# Change this to change the table size +AVERAGE_KEYS = [ + "alpaca_eval", + "bbh:cot::tulu", + "codex_humaneval", + "codex_humanevalplus", + "drop", + "gsm8k", + "ifeval", + "math::flex", + "mmlu:mc::tulu", + "popqa", + "truthfulqa", + "overall_oe_safety_average", +] + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Create a table of model performance metrics.", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + # Required arguments + parser.add_argument( + "--csv-path", required=True, help="Path to the CSV file containing the results" + ) + parser.add_argument( + "--models", + nargs="+", + required=True, + help="List of model names to generate table for", + ) + parser.add_argument( + "--markdown", + action="store_true", + help="Output in Markdown format instead of LaTeX", + ) + parser.add_argument( + "--extra_cols", + type=int, + default=0, + help="Number of extra columns to add to the table", + ) + + return parser.parse_args() + + +def format_value(value, markdown=False): + """Format a numeric value for table output.""" + if pd.isna(value): + return "N/A" + try: + return f"{float(value):.1f}" + except: + return "N/A" + + +def create_performance_table_rows(csv_path, model_names, markdown=False, extra_cols=0): + """ + Create performance table rows for the specified models. 
+ + Parameters: + csv_path (str): Path to the CSV file containing the results + model_names (list): List of model names to generate table for + markdown (bool): Whether to output in Markdown format + extra_cols (int): Number of extra columns to add to the table + """ + + try: + all_data = {} + df = pd.read_csv(csv_path) + rows = [] + + for model_name in model_names: + model_data = df[df["Model"] == model_name] + if len(model_data) == 0: + print(f"Warning: Model '{model_name}' not found in CSV file") + continue + + # Get pretty model name from conversion dictionary + pretty_name = model_label_conversion.get(model_name, model_name) + + # Replace "Tulu" with "\modelname" for LaTeX output only + if not markdown: + pretty_name = pretty_name.replace("Tulu ", "\\modelname~") + + all_data[pretty_name] = {} + + # Calculate average + for key in AVERAGE_KEYS: + model_data[key] = model_data[key].apply( + lambda x: float(x) if x != "nan" else None + ) + average = model_data[AVERAGE_KEYS].mean(axis=1).iloc[0] + all_data[pretty_name]["Avg."] = format_value(average, markdown) + + # add all the eval scores + for metric_name, metric_key in metric_keys.items(): + value = model_data[metric_key].iloc[0] + all_data[pretty_name][metric_name] = format_value(value, markdown) + + for metric_name in ["Avg."] + list(metric_keys.keys()): + values = [metric_name] + if metric_name == "Avg.": + values.append("") + else: + values.append(f"\\small{{{eval_settings[metric_name]}}}") + for pretty_name in all_data.keys(): + values.append(all_data[pretty_name][metric_name]) + + values = ["-1" if i == "N/A" else i for i in values] + numbers = [float(v) for v in values[2:]] + max_index = numbers.index(max(numbers)) + 2 + values[max_index] = f"\\textbf{{{values[max_index]}}}" + + if markdown: + # Markdown table row with pretty name + r = f"| | {' | '.join(values)} |" + r += " |" * extra_cols + rows.append(r) + else: + # LaTeX table row with pretty name + r = f"{' & '.join(values)}" + r += " &" * extra_cols + r += " \\\\" + rows.append(r) + if metric_name == "Avg.": + rows.append("\\midrule") + + return rows + + except FileNotFoundError: + print(f"Error: Could not find CSV file at {csv_path}") + sys.exit(1) + except pd.errors.EmptyDataError: + print(f"Error: CSV file at {csv_path} is empty") + sys.exit(1) + + +def create_latex_table(model_names, extra_cols): + """Return the LaTeX table header.""" + header = """\\begin{table}[] +\\centering +\\setlength\\tabcolsep{5pt} +\\adjustbox{max width=\\linewidth}{ +""" + column_spec = "ll" + for model_name in model_names: + if "Tulu" in model_label_conversion[model_name]: + # P is defined via \newcolumntype{P}{>{\columncolor{ai2pink}}c} + column_spec += "l" + else: + column_spec += "c" + column_spec += "c" * extra_cols + + header += ( + """\\begin{NiceTabular}{@{}""" + + column_spec + + """@{}} +\\toprule +""" + ) + header += """\\textbf{Benchmark} & \\textbf{Eval Setting}""" + for model_name in model_names: + pretty_name = model_label_conversion.get(model_name, model_name) + if "Tulu" in pretty_name: + pretty_name = pretty_name.replace("Tulu ", "\\modelname~") + pretty_name = f"\\textbf{{{pretty_name}}}" + header += " & \\rotatebox{90}{" + pretty_name + "}" + for i in range(extra_cols): + header += " & " + header += """\\\\\\midrule""" + return header + + +def create_latex_footer(): + """Return the LaTeX table footer.""" + return """\\bottomrule +\\end{NiceTabular}} +\\vspace{3pt} +\\caption{TODO} +\\label{tab:TODO} +\\end{table}""" + + +def main(): + """Main function to run the script.""" 
+ args = parse_args() + + rows = create_performance_table_rows( + csv_path=args.csv_path, + model_names=args.models, + markdown=args.markdown, + extra_cols=args.extra_cols, + ) + + if not args.markdown: + print(create_latex_table(model_names=args.models, extra_cols=args.extra_cols)) + + for row in rows: + print(row) + + if not args.markdown: + print(create_latex_footer()) + + +if __name__ == "__main__": + main() diff --git a/tulu_version_bars.pdf b/tulu_version_bars.pdf new file mode 100644 index 000000000..2f5f0809b Binary files /dev/null and b/tulu_version_bars.pdf differ
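
For reference, the merge config that the new scripts/submit_merge_job.py consumes (its default --merge_config points at configs/merge_configs/example_linear_merge_config.yaml) is not included in this diff. Below is a minimal sketch inferred from the fields the script actually reads: merge_method, normalize, and a models list whose entries carry name, location, path, and weight (plus wekaBucket for weka-hosted models, and a top-level base_model when merge_method is task_arithmetic). The model names, paths, and weights here are illustrative placeholders, not the contents of the real example config.

merge_method: linear          # or task_arithmetic (base_model is then required)
normalize: true
models:
  - name: model-one                        # mounted at /model-one when location is beaker
    location: beaker
    path: jacobm/example-beaker-dataset    # hypothetical Beaker dataset id
    weight: 0.5
  - name: model-two
    location: huggingface
    path: allenai/example-hf-model         # hypothetical Hugging Face repo
    weight: 0.5

A config like this would be submitted with the script's defaults, e.g.:

python scripts/submit_merge_job.py --merge_config configs/merge_configs/example_linear_merge_config.yaml --workspace tulu-3-dev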