[QEff. Finetuning]: Enhance test cases to match intermediate step level loss/metrics #531

Merged: 12 commits, Aug 18, 2025
6 changes: 5 additions & 1 deletion QEfficient/finetune/utils/train_utils.py
@@ -355,7 +355,6 @@ def train(
logger.log_rank_zero(
f"Epoch {epoch + 1}: Train epoch loss: {train_epoch_loss:.4f}, Train metric: {train_epoch_metric:.4f}, Epoch time {epoch_end_time:.2f} sec"
)

# Saving the results every epoch to plot later
if train_config.save_metrics:
save_to_json(
@@ -374,9 +373,14 @@

results["last_epoch_train_loss"] = train_epoch_loss.cpu()
results["last_epoch_train_metric"] = train_epoch_metric.cpu()
results["train_step_loss"] = train_step_loss
results["train_step_metric"] = train_step_metric

if train_config.run_validation:
results["last_epoch_eval_loss"] = eval_epoch_loss.cpu()
results["last_epoch_eval_metric"] = eval_epoch_metric.cpu()
results["eval_step_loss"] = eval_step_loss
results["eval_step_metric"] = eval_step_metric
results["avg_epoch_time"] = avg_epoch_time
results["avg_checkpoint_time"] = avg_checkpoint_time
if train_config.save_metrics:
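With these additions, the `results` dict returned by `train()` exposes the full per-step loss and metric series alongside the existing last-epoch aggregates, which is what lets the new tests assert at intermediate-step granularity. A minimal sketch of consuming the new keys, assuming the step values are plain floats or 0-d torch tensors (the `.cpu()` calls above suggest tensors for the epoch aggregates); `step_series` is a hypothetical helper, not part of this PR:

# Sketch: normalize the new step-level series to plain Python floats so a
# test can compare them element-wise against stored reference values.
def step_series(results, key):
    return [float(v) for v in results[key]]  # float() accepts 0-d tensors too

train_losses = step_series(results, "train_step_loss")
eval_losses = step_series(results, "eval_step_loss")  # present only when run_validation is set
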
10 changes: 10 additions & 0 deletions tests/finetune/constants.py
@@ -0,0 +1,10 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

# Finetuning Test Constants
LOSS_ATOL = 1e-3
METRIC_ATOL = 1e-3
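
These absolute tolerances bound how far an observed step value may drift from its stored baseline. For illustration, with LOSS_ATOL as defined above, a single-step check might look like this (a sketch; the numbers are taken from the reference data below, and `results` is the dict returned by `train()`):

import math

observed = 1.5112210  # e.g. float(results["train_step_loss"][0])
reference = 1.5112206935882568  # first Alpaca train-step loss in the baseline below
assert math.isclose(observed, reference, abs_tol=LOSS_ATOL)
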
236 changes: 236 additions & 0 deletions tests/finetune/reference_data.py
@@ -0,0 +1,236 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

"""Reference data for the finetune tests from SDK version - 1.21.0.23"""

# A dictionary to hold all reference data for all test sets.
REFERENCE_DATA = {
# Scenario 1: Single-device llama 3.2-1B training on Alpaca dataset.
"llama_3.2_1B_config_alpaca_single_device": {
"description": "Baseline for Llama on Alpaca single-device",
quic-swatia (Contributor) commented on Aug 14, 2025:
Please add the complete model ID here and in other configs as well.
"train_step_losses": [
1.5112206935882568,
1.2211230993270874,
1.9942185878753662,
2.093623161315918,
0.9168124198913574,
1.2125635147094727,
0.3648962676525116,
1.6231939792633057,
0.8259601593017578,
0.7741442918777466,
1.7359141111373901,
2.118462085723877,
2.061161994934082,
0.8256913423538208,
0.8088029623031616,
1.761340618133545,
1.6828027963638306,
1.3538823127746582,
2.0672550201416016,
3.1532647609710693,
],
"eval_step_losses": [
1.462059736251831,
0.24527676403522491,
1.046107292175293,
1.6403586864471436,
1.395291805267334,
2.8664817810058594,
1.035412311553955,
1.8670039176940918,
3.8079662322998047,
0.6516809463500977,
],
"train_step_metrics": [
4.532259941101074,
3.390994071960449,
7.34645938873291,
8.114261627197266,
2.5013046264648438,
3.3620924949645996,
1.4403645992279053,
5.069255828857422,
2.2840728759765625,
2.1687355041503906,
5.674112319946289,
8.318334579467773,
7.855090141296387,
2.283458948135376,
2.2452187538146973,
5.820234775543213,
5.380615711212158,
3.872429847717285,
7.903097629547119,
23.412376403808594,
],
"eval_step_metrics": [ # steps 0-9
4.31483793258667,
1.2779749631881714,
2.8465487957000732,
5.157018661499023,
4.036152362823486,
17.575077056884766,
2.816267251968384,
6.468885898590088,
45.05870819091797,
1.9187631607055664,
],
},
# Scenario 2: Single-device llama 3.2-1B training on GSM8k dataset.
"llama_3.2_1B_config_gsm8k_single_device": {
"description": "Baseline for Llama on GSM8k single-device",
"train_step_losses": [
2.250276803970337,
2.3231687545776367,
1.9379945993423462,
1.5981022119522095,
1.9867562055587769,
1.4573354721069336,
1.8969658613204956,
1.2177824974060059,
1.6489791870117188,
1.5380687713623047,
1.4025083780288696,
1.5301083326339722,
1.6858205795288086,
1.383747935295105,
1.7968919277191162,
1.4075607061386108,
1.6447738409042358,
1.2807793617248535,
0.8450672030448914,
1.5795941352844238,
],
"eval_step_losses": [
1.7081595659255981,
1.719305157661438,
1.153528094291687,
2.0051634311676025,
1.3372926712036133,
1.3009852170944214,
1.2207027673721313,
1.3452664613723755,
1.329830288887024,
1.307450532913208,
],
"train_step_metrics": [
9.490362167358398,
10.207969665527344,
6.944809913635254,
4.943641662597656,
7.291841506958008,
4.294501304626465,
6.6656389236450195,
3.3796849250793457,
5.201667308807373,
4.655590534210205,
4.065384864807129,
4.618677139282227,
5.396877765655518,
3.989826202392578,
6.030873775482178,
4.0859761238098145,
5.179838180541992,
3.5994436740875244,
2.328134298324585,
4.852985858917236,
],
"eval_step_metrics": [ # steps 0-9
5.518795013427734,
5.580649375915527,
3.1693549156188965,
7.42730712890625,
3.8087174892425537,
3.672913074493408,
3.38956880569458,
3.8392088413238525,
3.7804012298583984,
3.6967368125915527,
],
},
# Scenario 3: Single-device google-bert/bert-base-uncased training on IMDB dataset.
"bert_base_uncased_config_imdb_single_device": {
"description": "Baseline for google-bert/bert-base-uncased on IMDB single-device",
"train_step_losses": [
0.357421875,
0.546875,
0.98486328125,
0.35302734375,
1.23828125,
0.60791015625,
0.44384765625,
0.791015625,
0.7861328125,
0.51318359375,
0.50244140625,
0.90087890625,
0.8818359375,
0.86279296875,
0.6396484375,
0.49267578125,
0.97119140625,
0.7451171875,
0.798828125,
0.7080078125,
],
"eval_step_losses": [
0.634765625,
0.8173828125,
0.9072265625,
0.7177734375,
0.59423828125,
0.69921875,
0.7109375,
0.7216796875,
0.6064453125,
0.7041015625,
],
"train_step_metrics": [
1.0,
1.0,
0.5,
0.5,
0.5,
0.5,
0.5,
0.5,
0.5,
0.5,
0.5,
0.5,
0.5,
0.5,
0.5,
0.5,
0.5,
0.5,
0.449951171875,
0.4091796875,
],
"eval_step_metrics": [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0],
},
# Scenario 4: Distributed google-bert/bert-base-uncased training (world_size=2)
"bert_base_uncased_config_imdb_distributed_ws2": {
"description": "Baseline for distributed training with 2 devices",
"world_size": 2,
"rank_data": {
0: { # Data for Rank 0
"train_step_losses": [],
"eval_step_losses": [],
"train_step_metrics": [],
"eval_step_metrics": [],
},
1: { # Data for Rank 1
"train_step_losses": [],
"eval_step_losses": [],
"train_step_metrics": [],
"eval_step_metrics": [],
},
},
},
}
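
A sketch of how a test might resolve and check one of these baselines, assuming the new modules are importable as `tests.finetune.constants` and `tests.finetune.reference_data` (package layout is not shown in this diff) and that `results` is the dict returned by `train()` above; `check_train_step_losses` is a hypothetical helper, not part of this PR:

import math

from tests.finetune.constants import LOSS_ATOL
from tests.finetune.reference_data import REFERENCE_DATA

def check_train_step_losses(results, config_key, rank=None):
    # Look up the baseline; distributed scenarios nest per-rank series
    # under "rank_data", keyed by integer rank (see Scenario 4 above).
    entry = REFERENCE_DATA[config_key]
    if "rank_data" in entry:
        entry = entry["rank_data"][rank]
    expected = entry["train_step_losses"]
    observed = [float(v) for v in results["train_step_loss"]]
    assert len(observed) == len(expected), "step count mismatch"
    for step, (got, want) in enumerate(zip(observed, expected)):
        assert math.isclose(got, want, abs_tol=LOSS_ATOL), (
            f"step {step}: got {got}, expected {want}"
        )

For example, check_train_step_losses(results, "llama_3.2_1B_config_alpaca_single_device") would walk all 20 Alpaca train steps, while the distributed scenario would pass rank explicitly.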