longlora-paddle #9939

Open
wants to merge 9 commits into develop
Changes from 8 commits
36 changes: 36 additions & 0 deletions llm/config/llama/longlora.json
@@ -0,0 +1,36 @@
{
"model_name_or_path": "meta-llama/Meta-Llama-3-8B",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/lora_ckpts",
"per_device_train_batch_size": 4,
"gradient_accumulation_steps": 4,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"num_train_epochs": 1,
"learning_rate": 3e-04,
"warmup_steps": 30,
"logging_steps": 1,
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
"src_length": 1024,
"max_length": 2048,
"bf16": true,
"fp16_opt_level": "O2",
"do_train": true,
"do_eval": true,
"disable_tqdm": true,
"load_best_model_at_end": true,
"eval_with_do_generation": false,
"metric_for_best_model": "accuracy",
"recompute": true,
"save_total_limit": 1,
"tensor_parallel_degree": 1,
"pipeline_parallel_degree": 1,
"sharding": "stage1",
"lora": true,
"zero_padding": false,
"use_flash_attention": true,
"unified_checkpoint": true,
"pissa": false,
"use_mora": false
}
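
As a quick sanity check on these hyperparameters, the effective batch size and per-step token budget follow directly from the batch settings. The standalone sketch below is not part of the PR; it assumes a single device and simply recomputes the numbers from the config file:

import json

# Standalone sketch (not part of the diff): recompute the effective batch size and
# the token budget per optimizer step implied by llm/config/llama/longlora.json.
with open("llm/config/llama/longlora.json") as f:
    cfg = json.load(f)

effective_batch = cfg["per_device_train_batch_size"] * cfg["gradient_accumulation_steps"]  # 4 * 4 = 16
tokens_per_step = effective_batch * cfg["max_length"]                                      # 16 * 2048 = 32768
print(f"effective batch: {effective_batch}, tokens per step: {tokens_per_step}")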
9 changes: 9 additions & 0 deletions llm/run_finetune.py
@@ -168,6 +168,13 @@ def main():
quantization_config=quantization_config,
)

if training_args.use_ssa:
assert (
training_args.ssa_group_size_ratio is not None
), "ssa_group_size_ratio must be specified when use_ssa is True"
model_config.use_ssa = True
model_config.ssa_group_size_ratio = training_args.ssa_group_size_ratio

architectures_to_check = {"Qwen2Moe", "DeepseekV2", "DeepseekV3"}
if (
any(architecture in str(model_config.architectures) for architecture in architectures_to_check)
@@ -192,6 +199,8 @@ def main():
model_config.fuse_attention_ffn = model_args.fuse_attention_ffn

model_config.seq_length = data_args.max_length
orig_ctx_len = getattr(model_config, "max_position_embeddings", None)
model_args.rope_scaling_factor = data_args.max_length // orig_ctx_len

# Config for model using long sequence strategy
if model_args.use_long_sequence_strategies:
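
Two notes on these hunks: ssa_group_size_ratio is only forwarded to the model config when use_ssa is enabled, and rope_scaling_factor is derived as the integer ratio of the target sequence length to the base context window. Because getattr(model_config, "max_position_embeddings", None) may return None, and max_length may be smaller than the base window, a guarded variant of the same derivation could look like the sketch below (the helper name is hypothetical and the code is illustrative only, not part of the PR):

def derive_rope_scaling_factor(model_config, target_length):
    # Hypothetical helper, not in the PR: ratio of the target length to the base
    # context window, floored at 1 so RoPE is never scaled below the original range.
    orig_ctx_len = getattr(model_config, "max_position_embeddings", None)
    if not orig_ctx_len:
        return 1
    return max(1, target_length // orig_ctx_len)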
2 changes: 1 addition & 1 deletion llm/utils/data.py
@@ -84,7 +84,7 @@ def tokenize_unsupervised_example(tokenizer, example, data_args, is_test=True, z
source,
truncation=False,
padding=True,
- max_length=data_args.scaled_max_length,
+ max_length=data_args.src_length,
add_special_tokens=True,
)

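
For context, this helper is invoked per example during dataset preprocessing. A hypothetical invocation is sketched below; the {"src": ...} record layout follows the llm fine-tuning data format, and anything not visible in the hunk is an assumption:

# Hypothetical usage sketch; the record layout and data_args fields are assumptions.
features = tokenize_unsupervised_example(
    tokenizer,
    {"src": "A long unsupervised training document ..."},
    data_args,   # with this change, data_args.src_length caps the tokenized source
    is_test=True,
)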
4 changes: 4 additions & 0 deletions paddlenlp/transformers/llama/configuration.py
@@ -159,6 +159,8 @@ def __init__(
use_last_token_for_generation=False,
immediate_clear_past_key_value=False,
dpo_config=None,
use_ssa=False,
ssa_group_size_ratio=None,
**kwargs,
):
self.vocab_size = vocab_size
@@ -197,6 +199,8 @@ def __init__(
self.use_last_token_for_generation = use_last_token_for_generation
self.immediate_clear_past_key_value = immediate_clear_past_key_value
self.dpo_config = dpo_config
self.use_ssa = use_ssa
self.ssa_group_size_ratio = ssa_group_size_ratio

super().__init__(
pad_token_id=pad_token_id,
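
The two new fields follow LongLoRA's shifted sparse attention (S2-Attn) setup, where the attention group size is expressed as a fraction of the sequence length. A minimal construction sketch is below; the 0.25 ratio is the value used in the LongLoRA paper and is shown purely as an illustration:

from paddlenlp.transformers import LlamaConfig

# Minimal sketch: enable the SSA fields added by this PR.
config = LlamaConfig(use_ssa=True, ssa_group_size_ratio=0.25)

# Illustrative only: at attention time the ratio would translate into a group size,
# e.g. an 8192-token sequence with ratio 0.25 attends within groups of 2048 tokens.
seq_len = 8192
group_size = int(seq_len * config.ssa_group_size_ratio)  # 2048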