[Draft] LoRA on DCU #9959

Open
wants to merge 27 commits into develop

Commits (27)
741b8e7
add modeling_pp
DrownFish19 Jan 26, 2025
cf82bcc
add modeling_pp for qwen2moe
DrownFish19 Jan 27, 2025
d646dba
add flashmask and pp for Qwen2MoE and Deepseek
DrownFish19 Feb 1, 2025
3fcf2c1
remove
DrownFish19 Feb 1, 2025
3a320cb
Merge remote-tracking branch 'paddlenlp/develop' into dev_20250126_ad…
DrownFish19 Feb 1, 2025
d55f559
fix fast_tokenizer save
DrownFish19 Feb 1, 2025
b104eaa
update for topk_weight of noaux_tc
DrownFish19 Feb 6, 2025
4c7f5d6
Merge branch 'PaddlePaddle:develop' into dev_20250126_add_pipeline_fo…
DrownFish19 Feb 6, 2025
ecad2f1
fix for flashmask
DrownFish19 Feb 7, 2025
446b4da
Merge remote-tracking branch 'paddlenlp/develop' into dev_20250126_ad…
DrownFish19 Feb 7, 2025
80f5c98
add use_expert_parallel for pretrain
DrownFish19 Feb 7, 2025
47628e4
fix tokenizer test
DrownFish19 Feb 10, 2025
2651521
modify model size for dev
DrownFish19 Feb 10, 2025
035b7fe
Merge branch 'PaddlePaddle:develop' into dev_20250210_deepseek_v3_pre…
DrownFish19 Feb 11, 2025
a182411
Merge commit 'refs/pull/9838/head' of https://github.com/PaddlePaddle…
phlrain Feb 12, 2025
72bc6ee
update
phlrain Feb 12, 2025
8f3028d
mtp
DrownFish19 Feb 17, 2025
c75fd64
MTP
DrownFish19 Feb 17, 2025
175d735
update default config
DrownFish19 Feb 17, 2025
068e192
update MTP
DrownFish19 Feb 17, 2025
02f407f
update seq_aux_loss
DrownFish19 Feb 17, 2025
45b386e
Merge remote-tracking branch 'paddlenlp/develop' into dev_20250214_de…
DrownFish19 Feb 17, 2025
c8aa47a
Merge commit 'refs/pull/9876/head' of https://github.com/PaddlePaddle…
yongqiangma Feb 21, 2025
0c13a3c
Merge commit 'refs/pull/9849/head' of https://github.com/PaddlePaddle…
yongqiangma Feb 21, 2025
d1d8d9c
lora with use_flash_attention=false
yongqiangma Feb 27, 2025
27579e7
add config
yongqiangma Feb 28, 2025
471b172
use_flash_attention false
yongqiangma Feb 28, 2025
8 changes: 8 additions & 0 deletions debug.sh
@@ -0,0 +1,8 @@


# set paddle env
export PYTHONPATH=/paddle/workspace/Paddle/build/python:/paddle/new_env/deepseek/PaddleNLP



python -u -m paddle.distributed.launch --gpus "2" run_pretrain.py ./config/deepseek-v2/pretrain_argument.json --continue_training False
6 changes: 3 additions & 3 deletions llm/config/deepseek-v2/pretrain_argument.json
@@ -1,6 +1,6 @@
{
"model_name_or_path": "deepseek-ai/DeepSeek-V2-Lite",
"tokenizer_name_or_path": "deepseek-ai/DeepSeek-V2-Lite",
"model_name_or_path": "deepseek-ai/DeepSeek-V3",
"tokenizer_name_or_path": "deepseek-ai/DeepSeek-V3",
"input_dir": "./data",
"output_dir": "./checkpoints/pretrain_ckpts",
"per_device_train_batch_size": 1,
@@ -12,7 +12,7 @@
"sharding": "stage2",
"virtual_pp_degree": 1,
"sequence_parallel": 0,
"use_flash_attention": true,
"use_flash_attention": false,
"max_seq_length": 4096,
"learning_rate": 3e-05,
"min_learning_rate": 3e-06,
9 changes: 5 additions & 4 deletions llm/config/deepseek-v2/sft_argument.json
@@ -1,11 +1,11 @@
{
"model_name_or_path": "deepseek-ai/DeepSeek-V2-Lite",
"model_name_or_path": "deepseek-ai/DeepSeek-V3",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/sft_ckpts",
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 4,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"gradient_accumulation_steps": 1,
"per_device_eval_batch_size": 1,
"eval_accumulation_steps":1,
"num_train_epochs": 3,
"learning_rate": 3e-05,
"warmup_steps": 30,
@@ -27,6 +27,7 @@
"tensor_parallel_degree": 1,
"pipeline_parallel_degree": 1,
"sharding": "stage2",
"sharding_parallel_degree": 1,
"zero_padding": false,
"unified_checkpoint": true,
"use_flash_attention": true
41 changes: 41 additions & 0 deletions llm/config/deepseek-v3/lora_argument.json
@@ -0,0 +1,41 @@
{
"model_name_or_path": "deepseek-ai/DeepSeek-V2",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/sft_ckpts",
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 1,
"per_device_eval_batch_size": 1,
"eval_accumulation_steps": 1,
"num_train_epochs": 3,
"learning_rate": 3e-05,
"warmup_steps": 30,
"logging_steps": 1,
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
"src_length": 1024,
"max_length": 2048,
"bf16": true,
"fp16_opt_level": "O2",
"do_train": true,
"do_eval": true,
"disable_tqdm": true,
"continue_training": false,
"pipeline_parallel_config": "enable_delay_scale_loss disable_partial_send_recv disable_batch_p2p_comm",
"tensor_parallel_config": "enable_delay_scale_loss",
"load_best_model_at_end": true,
"eval_with_do_generation": false,
"metric_for_best_model": "accuracy",
"recompute": true,
"recompute_use_reentrant": true,
"recompute_granularity": "full",
"save_total_limit": 1,
"tensor_parallel_degree": 8,
"pipeline_parallel_degree": 4,
"sharding_parallel_degree": 1,
"sharding": "stage2",
"zero_padding": true,
"flash_mask": false,
"unified_checkpoint": true,
"use_flash_attention": false,
"lora": true
}
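Note that the parallel degrees in this LoRA config pin down the minimum number of devices. A quick sanity check, sketched in Python (the path and the check itself are illustrative, not part of this PR):

import json

# Minimal sketch: read the degrees from the config added above and report
# the minimum world size they imply (tp * pp * sharding, with no extra data parallelism).
with open("llm/config/deepseek-v3/lora_argument.json") as f:
    cfg = json.load(f)

tp = cfg["tensor_parallel_degree"]      # 8
pp = cfg["pipeline_parallel_degree"]    # 4
sd = cfg["sharding_parallel_degree"]    # 1
print("minimum world size:", tp * pp * sd)  # 32 cards for the values above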
23 changes: 23 additions & 0 deletions llm/lora.sh
@@ -0,0 +1,23 @@
export PYTHONPATH=/ssd1/mayongqiang/PaddleNLP/:$PYTHONPATH
# export PATH="$HOME/.cargo/bin:$PATH"
# export PATH=/root/anaconda3/bin/:$PATH

export http_proxy=http://agent.baidu.com:8891
export https_proxy=http://agent.baidu.com:8891
export no_proxy=localhost,bj.bcebos.com,su.bcebos.com,pypi.tuna.tsinghua.edu.cn,paddle-ci.gz.bcebos.com

export PPNLP_HOME="/ssd1/mayongqiang/ppnlp_home/"

# The Lite model can be used for early-stage validation
MODEL_TAG=deepseek-ai/DeepSeek-V2-Lite-Chat
# MODEL_TAG=deepseek-ai/DeepSeek-V2-Chat
# MODEL_TAG=deepseek-ai/DeepSeek-V3
# MODEL_TAG=deepseek-ai/DeepSeek-R1

# QUANT_MODE=
QUANT_MODE=weight_only_int8
# QUANT_MODE=weight_only_int4

# python run_finetune.py ./devices/dcu/llama/lora_argument.json
# python run_pretrain.py ./config/llama/lora_argument.json --continue_training False
python run_finetune.py ./config/deepseek-v3/lora_argument.json
7 changes: 7 additions & 0 deletions llm/run_finetune.py
@@ -228,6 +228,13 @@ def main():
)
else:
# NOTE(gongenlei): new add autotuner_benchmark
+ # Reduce the number of hidden layers here for debugging; DeepSeek's first 3 layers are dense, the sparse (MoE) layers come after them
+ model_config.num_hidden_layers = 2 # 61 in V3
+ model_config.first_k_dense_replace = 1 # 3 in V3
+ # Reduce the number of routed experts here; for expert parallelism (EP), the expert count must be divisible by the parallel degree
+ model_config.n_routed_experts = 16 # 256 in V3
+ model_config.num_experts_per_tok = 4 # 8 in V3
+ model_config.topk_group = 2 # 4 in V3
model = model_class.from_config(model_config, dtype=dtype)

if model_args.flash_mask and (not data_args.zero_padding or not model.config.use_flash_attention):
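The expert-parallel comment above boils down to a divisibility constraint; a minimal sketch of that check, with expert_parallel_degree as a hypothetical value rather than anything taken from this PR:

def check_expert_parallel(n_routed_experts: int, expert_parallel_degree: int) -> int:
    # Each EP rank hosts n_routed_experts / expert_parallel_degree experts,
    # so the expert count must divide evenly by the EP degree.
    if n_routed_experts % expert_parallel_degree != 0:
        raise ValueError(
            f"n_routed_experts={n_routed_experts} not divisible by "
            f"expert_parallel_degree={expert_parallel_degree}"
        )
    return n_routed_experts // expert_parallel_degree

# With the reduced debug config above: 16 experts across e.g. 4 EP ranks -> 4 experts per rank.
print(check_expert_parallel(16, 4))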
8 changes: 8 additions & 0 deletions llm/run_pretrain.py
@@ -496,6 +496,14 @@ def main():
dtype=dtype,
)
else:
+ # Reduce the number of hidden layers here for debugging; DeepSeek's first 3 layers are dense, the sparse (MoE) layers come after them
+ config.num_hidden_layers = 1 # 61 in V3
+ config.first_k_dense_replace = 1 # 3 in V3
+ # Reduce the number of routed experts here; for expert parallelism (EP), the expert count must be divisible by the parallel degree
+ config.n_routed_experts = 16 # 256 in V3
+ config.num_experts_per_tok = 2 # 8 in V3
+ config.hidden_size = 128
+ config.topk_group = 2 # 4 in V3
model = model_class.from_config(config, dtype=dtype)

if training_args.recompute:
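For intuition, the reduced pretrain config routes each token to num_experts_per_tok of the n_routed_experts. A standalone sketch of plain top-k gating with these numbers (the real model additionally applies group-limited routing via topk_group, omitted here for brevity):

import numpy as np

def topk_gate(logits: np.ndarray, k: int):
    # Pick the k highest-scoring experts per token and normalize their weights.
    idx = np.argsort(logits, axis=-1)[:, -k:]                     # expert ids per token
    val = np.take_along_axis(logits, idx, axis=-1)
    weights = np.exp(val) / np.exp(val).sum(axis=-1, keepdims=True)
    return idx, weights

logits = np.random.randn(4, 16)        # 4 tokens, 16 routed experts (debug config above)
idx, weights = topk_gate(logits, k=2)  # num_experts_per_tok = 2 in the debug config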
4 changes: 3 additions & 1 deletion paddlenlp/transformers/deepseek_v2/configuration.py
@@ -139,7 +139,8 @@ def __init__(
intermediate_size=11008,
moe_intermediate_size=1407,
num_hidden_layers=30,
- num_nextn_predict_layers=1,
+ num_nextn_predict_layers=0,
+ num_nextn_predict_lambda=1.0,
num_attention_heads=32,
num_key_value_heads=32,
n_shared_experts=None,
@@ -187,6 +188,7 @@ def __init__(
self.moe_intermediate_size = moe_intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_nextn_predict_layers = num_nextn_predict_layers
+ self.num_nextn_predict_lambda = num_nextn_predict_lambda
self.num_attention_heads = num_attention_heads
self.n_shared_experts = n_shared_experts
self.n_routed_experts = n_routed_experts
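The new num_nextn_predict_lambda presumably weights the auxiliary loss of the next-n (MTP) prediction layers against the main language-modeling loss; a minimal sketch of such a combination, not the PR's actual implementation:

def combine_mtp_loss(lm_loss, mtp_losses, num_nextn_predict_lambda=1.0):
    # lm_loss: loss of the main next-token head.
    # mtp_losses: one loss per extra next-n prediction layer (empty when
    # num_nextn_predict_layers == 0, the new default).
    if not mtp_losses:
        return lm_loss
    return lm_loss + num_nextn_predict_lambda * sum(mtp_losses) / len(mtp_losses)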