@@ -0,0 +1,261 @@
hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "qwen2.5-0.5B-rlvr-config-temp-reward"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/rl_examples/models/${exp_name}

#track_with: wandb
#tracker_kwargs:
#  api_key:
#  project: roll_examples
#  notes: roll_examples
#  tags:
#    - rlvr
#    - baseline

track_with: tensorboard
tracker_kwargs:
  log_dir: /data/oss_bucket_0/rl_examples/llm/tensorboard/roll_exp/rlvr

num_gpus_per_node: 8
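# NOTE: the device_mapping entries below go up to GPU 15, so with 8 GPUs per
# node this config assumes a 2-node (16-GPU) job: actor_train and reference
# use GPUs 0-15, actor_infer GPUs 0-11, and the llm_judge reward worker
# GPUs 12-15 (see rewards.llm_judge).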

max_steps: 500
save_steps: 100
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false


rollout_batch_size: 64  # number of prompts sampled per rollout step
prompt_length: 2048
response_length: 4096

num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "reinforce"
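# Batch math for the settings above: 64 prompts × 8 responses each = 512
# responses per rollout step; on the training side, 16 data-parallel GPUs ×
# per_device_train_batch_size 1 × gradient_accumulation_steps 32 = 512
# sequences per optimizer step, so with ppo_epochs: 1 each rollout feeds
# exactly one update.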

# clip
value_clip: 0.5
reward_clip: 10
advantage_clip: 2.0
dual_clip_loss: true

# normalize
reward_norm: null
reward_shift: false
reward_scale: false

# data mask
max_len_mask: true
difficulty_mask: true
difficulty_low_threshold: 0.1
difficulty_high_threshold: 0.95
error_max_len_clip: false

# reward-based prompt filtering
use_filtering_metric: true
filtering_metric: "mean_temporal_reward"
num_recent_reward: 2
filtering_warmup_epoch: 1
filtering_max_epoch: 10
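# With the values above, the filtering metric is computed over the last 2
# rewards observed per prompt, and filtering is active from epoch 1 through
# epoch 10; the corresponding RLVRConfig fields are added in
# roll/pipeline/rlvr/rlvr_config.py later in this diff.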

# data weight
difficulty_loss_weight: false
length_loss_weight: false

# reward
add_token_level_kl: false

# advantage
whiten_advantages: true

# dynamic sampling scheduler
# use_additional_prompts: true
# max_running_requests: 256
# is_num_return_sequences_expand: false

pretrain: Qwen/Qwen2.5-7B
reward_pretrain: Qwen/Qwen2.5-7B

validation:
  data_args:
    template: qwen2_5
    file_name:
      - data/aime24_25_deal.jsonl
  generating_args:
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 1
  eval_steps: 10

actor_train:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 32
    warmup_steps: 20
    num_train_epochs: 50
  data_args:
    template: qwen2_5
    file_name:
      - data/code_KodCode_data.jsonl
      - data/llm_judge_Multi-subject-RLVR_deal_new.jsonl
      - data/math_deepmath_deal.jsonl
      - data/general_ifeval_train_deal.jsonl
      - data/general_CrossThink-QA_deal.jsonl
    domain_interleave_probs:
      math_rule: 0.4
      code_sandbox: 0.3
      llm_judge: 0.1
      crossthinkqa: 0.1
      ifeval: 0.1
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      use_distributed_optimizer: true
      recompute_granularity: full
  device_mapping: list(range(0,16))
  infer_batch_size: 4
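  # With tensor/pipeline/expert parallel sizes all 1, megatron_train runs as
  # pure 16-way data parallelism (with a distributed optimizer) over the
  # device_mapping above.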

actor_infer:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.6
      block_size: 16
      max_model_len: 8000
  device_mapping: list(range(0,12))
  infer_batch_size: 1
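  # max_model_len 8000 covers prompt_length (2048) + response_length (4096)
  # = 6144 tokens with headroom.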

reference:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
  device_mapping: list(range(0,16))
  infer_batch_size: 8

rewards:
  crossthinkqa:
    worker_cls: roll.pipeline.rlvr.rewards.crossthinkqa_rule_reward_worker.CrossThinkQARuleRewardWorker
    reward_type: soft
    response_length_penalty_coef: 0.0
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen2_5
    tag_included: [crossthinkqa]
    world_size: 8
    infer_batch_size: 4
  ifeval:
    worker_cls: roll.pipeline.rlvr.rewards.ifeval_rule_reward_worker.GeneralRuleRewardWorker
    reward_type: soft
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen2_5
    tag_included: [ifeval]
    world_size: 8
    infer_batch_size: 4
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen2_5
    tag_included: [deepmath_103k, aime]
    world_size: 8
    infer_batch_size: 1
    # dynamic filter config
    # query_filter_config:
    #   type: mean_filter
    #   filter_args:
    #     threshold_up: 0.9
    #     threshold_down: 0.1
  code_sandbox:
    use_local: true
    worker_cls: roll.pipeline.rlvr.rewards.code_sandbox_reward_worker.CodeSandboxRewardWorker
    tag_included: [KodCode]
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen2_5
    world_size: 8
    infer_batch_size: 1
    # query_filter_config:
    #   type: std_filter
    #   filter_args:
    #     std_threshold: 0
  llm_judge:
    # NOTE: LLM-as-judge also needs GPUs; it must not share GPUs with actor_infer
    worker_cls: roll.pipeline.rlvr.rewards.llm_judge_reward_worker.LLMJudgeRewardWorker
    judge_prompt: Qwen2.5-7B-Instruct-RLVR-prompt
    judge_model_type: inference
    tag_included: [RLVR]
    model_args:
      model_name_or_path: AI-ModelScope/Qwen2.5-7B-Instruct-RLVR
      flash_attn: fa2
      disable_gradient_checkpointing: true
      dtype: bf16
      model_type: trl
    generating_args:
      max_new_tokens: 100
      top_p: 0.8
      top_k: 50
      num_beams: 1
      temperature: 0.8
      num_return_sequences: 1
    data_args:
      template: qwen2_5
    strategy_args:
      strategy_name: hf_infer
      strategy_config: null
    device_mapping: list(range(12,16))
    infer_batch_size: 4
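# NOTE: each reward entry's tag_included list appears to route samples to that
# worker by dataset tag (e.g. KodCode → code_sandbox, aime → math_rule), and
# world_size: 8 presumably runs eight replicas of each reward worker.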
@@ -0,0 +1,5 @@
#!/bin/bash
set +x

# Use the name of the directory containing this script as the config path.
CONFIG_PATH=$(basename "$(dirname "$0")")
python examples/start_rlvr_pipeline.py --config_path "$CONFIG_PATH" --config_name rlvr_config
roll/pipeline/rlvr/rlvr_config.py (5 additions, 0 deletions)
@@ -229,6 +229,11 @@ class RLVRConfig(BaseConfig):
    difficulty_high_threshold: float = field(default=1.0)
    error_max_len_clip: bool = field(default=False)
    error_max_len_threshold: int = field(default=9999999999)
    use_filtering_metric: bool = field(default=False, metadata={"help": "Whether to apply a reward-based metric for prompt filtering."})
    filtering_metric: str = field(default=None, metadata={"help": "Metric used for prompt filtering (e.g. 'mean_temporal_reward')."})
    num_recent_reward: int = field(default=0, metadata={"help": "Number of most recent rewards used for prompt filtering."})
    filtering_warmup_epoch: int = field(default=0, metadata={"help": "First epoch at which prompt filtering is applied."})
    filtering_max_epoch: int = field(default=0, metadata={"help": "Last epoch at which prompt filtering is applied."})

    def __post_init__(self):
        super().__post_init__()
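For readers of this diff, here is a minimal sketch of how these five fields could drive prompt filtering. It assumes that `mean_temporal_reward` means a rolling mean over the last `num_recent_reward` group-mean rewards per prompt, and that the config's `difficulty_low_threshold` / `difficulty_high_threshold` serve as the keep bounds; the `TemporalRewardFilter` class and its method names are hypothetical, not the actual ROLL implementation.

```python
from collections import defaultdict, deque


class TemporalRewardFilter:
    """Hypothetical helper illustrating the new fields; not ROLL's actual code."""

    def __init__(self, config):
        # config: an RLVRConfig-like object carrying the five new fields
        # plus difficulty_low_threshold / difficulty_high_threshold.
        self.config = config
        # Per-prompt rolling window of the last `num_recent_reward` rewards.
        self.history = defaultdict(
            lambda: deque(maxlen=config.num_recent_reward)
        )

    def observe(self, prompt_id, group_mean_reward):
        # Record the group-mean reward a prompt received this step.
        self.history[prompt_id].append(group_mean_reward)

    def keep(self, prompt_id, epoch):
        cfg = self.config
        if not cfg.use_filtering_metric:
            return True
        # Only filter inside the [warmup, max] epoch window.
        if not (cfg.filtering_warmup_epoch <= epoch <= cfg.filtering_max_epoch):
            return True
        rewards = self.history[prompt_id]
        if len(rewards) < cfg.num_recent_reward:
            return True  # not enough reward history yet
        if cfg.filtering_metric == "mean_temporal_reward":
            mean = sum(rewards) / len(rewards)
            # Drop prompts whose recent rewards mark them as trivially easy
            # or hopelessly hard (mirroring the difficulty_mask thresholds).
            return cfg.difficulty_low_threshold <= mean <= cfg.difficulty_high_threshold
        return True
```

Under the YAML above (`num_recent_reward: 2`, thresholds 0.1 / 0.95), a prompt whose last two group-mean rewards average 0.98 would be dropped as saturated, while one averaging 0.5 would be kept.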