Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/actions/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ runs:
run: echo "node_name=$NODE_NAME" | tee -a "$GITHUB_OUTPUT"

- name: Checkout repository
uses: actions/checkout@v2
uses: actions/checkout@v6

- name: Change ownership of /home/runner/
shell: bash
Expand Down Expand Up @@ -98,7 +98,8 @@ runs:
--environment dev \
--platform dgx_h100 \
--tag ${{ inputs.tag }} \
--container-image ${{ inputs.container-image }}
--container-image ${{ inputs.container-image }} \
--hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME

RUN_TEST_EOF
)
Expand Down Expand Up @@ -186,6 +187,7 @@ runs:
--platform dgx_h100 \
--container-image ${{ inputs.container-image }} \
--data-dir /mnt/datadrive/TestData/megatron-lm/artifacts \
--hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME

RUN_TEST_EOF
)
Expand Down
2 changes: 1 addition & 1 deletion .github/copy-pr-bot.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
enabled: true
auto_sync_draft: false
auto_sync_ready: true
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "ksivaman", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "nanz-nv", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "wplf", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]
38 changes: 19 additions & 19 deletions .github/oncall_schedule.json
Original file line number Diff line number Diff line change
@@ -1,16 +1,4 @@
[
{
"user": "janEbert",
"date": "2026-02-18"
},
{
"user": "asolergi-nv",
"date": "2026-02-25"
},
{
"user": "BoxiangW",
"date": "2026-03-04"
},
{
"user": "maanug-nv",
"date": "2026-03-11"
Expand All @@ -20,31 +8,43 @@
"date": "2026-03-18"
},
{
"user": "gautham-kollu",
"user": "janEbert",
"date": "2026-03-25"
},
{
"user": "janEbert",
"user": "gautham-kollu",
"date": "2026-04-01"
},
{
"user": "maanug-nv",
"user": "ilml",
"date": "2026-04-08"
},
{
"user": "BoxiangW",
"user": "Phlip79",
"date": "2026-04-15"
},
{
"user": "Phlip79",
"user": "asolergi-nv",
"date": "2026-04-22"
},
{
"user": "asolergi-nv",
"user": "BoxiangW",
"date": "2026-04-29"
},
{
"user": "dimapihtar",
"user": "maanug-nv",
"date": "2026-05-06"
},
{
"user": "dimapihtar",
"date": "2026-05-13"
},
{
"user": "gautham-kollu",
"date": "2026-05-20"
},
{
"user": "ilml",
"date": "2026-05-27"
}
]
46 changes: 15 additions & 31 deletions .github/pull_request_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,8 @@

## Contribution process

```mermaid
flowchart LR
A[Pre-checks] --> B[PR Tests]
subgraph Code Review/Approval
C1[Expert Review] --> C2[Final Review]
end
B --> C1
C2 --> D[Merge]
```

### Pre-checks

- [ ] I want this PR in a versioned release and have added the appropriate Milestone (e.g., `Core 0.8`)
- [ ] I have added relevant unit tests
- [ ] I have added relevant functional tests
- [ ] I have added proper typing to my code [Typing guidelines](https://docs.python.org/3/library/typing.html)
Expand All @@ -26,41 +15,36 @@ flowchart LR

### Code review

The following process is enforced via the CODEOWNERS file for changes into `megatron/core`. For changes outside of `megatron/core`, it is up to the PR author whether or not to tag the Final Reviewer team.
Feel free to message or mention [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) in a comment to help accelerate your merge into main. The less complex your PR is, the faster it will be approved and merged!

<details>
<summary>For MRs into `main` branch</summary>
All PRs start as **draft**. If you open a non-draft PR, it will be automatically converted to draft.

Feel free to message or comment the @mcore-oncall to help accelerate your merge into main. The less complex your PR is, the faster it will be approved and merged!
#### Step 1: Mark PR as "Ready for Review"

#### (Step 1): Add PR label `Expert Review`
1. When your PR is ready, click **Ready for Review**.
2. An oncall reviewer is auto-assigned and expert reviewers are notified based on your changes.
- Some PRs may jump straight to step 2. This is determined by `.github/CODEOWNERS`.

#### (Step 2): Collect the expert reviewers reviews
:warning: Only mark as ready once merge conflicts are resolved and the CI is passing.
Final Review might get declined if these requirements are not fulfilled.

1. Attach the `Expert Review` label when your PR is ready for review.
2. GitHub auto-assigns expert reviewers based on your changes. They will get notified and pick up your PR soon.
#### Step 2: Final Review

:warning: Only proceed to the next step once all reviewers have approved, merge conflicts are resolved and the CI is passing.
Final Review might get declined if these requirements are not fulfilled.
For PRs that change `megatron/core`, once all expert reviewers have approved, the `Final Review` label is applied **automatically** and final reviewers are assigned.

#### (Step 3): Final Review
For PRs outside `megatron/core`, this step is skipped.

1. Add `Final Review` label
2. GitHub auto-assigns final reviewers based on your changes. They will get notified and pick up your PR soon.
#### Step 3: Approved

#### (Optional Step 4): Cherry-pick into release branch
Once all required reviewers have approved, the `Approved` label is applied **automatically**.

If this PR also needs to be merged into `core_r*` release branches, after this PR has been merged, select `Cherry-pick` to open a new PR into the release branch.
### Merge

</details>
Any member of [mcore-engineers](https://github.com/orgs/NVIDIA/teams/mcore-engineers) will be able to merge your PR.

<details>
<summary>For MRs into `dev` branch</summary>
The proposed review process for `dev` branch is under active discussion.

MRs are mergeable after one approval by either `eharper@nvidia.com` or `zijiey@nvidia.com`.
</details>

### Merging your PR

Any member of [core-adlr](https://github.com/orgs/teams/NVIDIA/core-adlr) and [`core-nemo`](https://github.com/orgs/teams/NVIDIA/core-nemo) will be able to merge your PR.
6 changes: 3 additions & 3 deletions .github/workflows/_build_test_publish_wheel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
PUBLISH_DRYRUN: ${{ inputs.dry-run }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
ref: ${{ inputs.ref }}

Expand Down Expand Up @@ -136,7 +136,7 @@ jobs:
test "${{ steps.build-wheel.outputs.expected-release-number }}" == "$RELEASE_NUMBER"

- name: Upload wheels
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v6
with:
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
path: dist/
Expand All @@ -159,7 +159,7 @@ jobs:
PACKAGE: ${{ matrix.PACKAGE }}
steps:
- name: Download wheels
uses: actions/download-artifact@v4
uses: actions/download-artifact@v7
with:
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
path: dist/
Expand Down
41 changes: 32 additions & 9 deletions .github/workflows/_release_library.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,34 @@ on:
description: Starting tag for changelog builder (leave empty for auto-detect)
type: string
default: ""
publish-docs:
required: false
description: Publish documentation to S3 after release
type: boolean
default: true
secrets:
TWINE_PASSWORD:
required: true
SLACK_WEBHOOK:
required: true
PAT:
required: true
AWS_ASSUME_ROLE_ARN:
required: true
AWS_ACCESS_KEY_ID:
required: true
AWS_SECRET_ACCESS_KEY:
required: true
AKAMAI_HOST:
required: true
AKAMAI_CLIENT_TOKEN:
required: true
AKAMAI_CLIENT_SECRET:
required: true
AKAMAI_ACCESS_TOKEN:
required: true
S3_BUCKET_NAME:
required: true

permissions:
contents: write # To read and write repository content
Expand Down Expand Up @@ -89,7 +110,7 @@ jobs:
IS_DRY_RUN: ${{ inputs.dry-run }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
token: ${{ secrets.PAT }}
Expand Down Expand Up @@ -199,11 +220,8 @@ jobs:
# Extract PR number from URL
PR_NUMBER=$(echo $PR_URL | grep -o '[0-9]*$')

# Add comment to the newly created PR
echo gh pr comment $PR_NUMBER --body "/ok to test $(git rev-parse HEAD)"

- name: Wait for status checks on tmp branch
uses: actions/github-script@v7
uses: actions/github-script@v8
id: wait-status
with:
github-token: ${{ secrets.PAT }}
Expand Down Expand Up @@ -326,7 +344,6 @@ jobs:
ref: ${{ inputs.release-ref }}
no-publish: false
secrets:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}

create-gh-release:
Expand All @@ -344,10 +361,10 @@ jobs:
REPOSITORY: ${{ github.repository }}
PROJECT_NAME: Megatron Core
VERSION: ${{ needs.bump-next-version.outputs.release-version }}
TAG_PREFIX: ${{ inputs.gh-release-tag-prefix || '' }}
TAG_PREFIX: core_
steps:
- name: Checkout repository
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
ref: ${{ inputs.release-ref }}
Expand Down Expand Up @@ -455,6 +472,12 @@ jobs:
publish-docs:
needs: [bump-next-version, create-gh-release]
uses: ./.github/workflows/release-docs.yml
if: |
(
success() || !failure()
)
&& inputs.publish-docs == true
&& !cancelled()
with:
dry-run: ${{ inputs.dry-run }}
publish-as-latest: true
Expand All @@ -472,7 +495,7 @@ jobs:
VERSION: ${{ needs.build-test-publish-wheels.outputs.version }}
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
repository: NVIDIA-NeMo/FW-CI-templates
ref: v0.17.0
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/_update_dependencies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
TARGET_BRANCH: ${{ inputs.target-branch }}
steps:
- name: Checkout repo
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
ref: ${{ env.TARGET_BRANCH }}

Expand All @@ -60,7 +60,7 @@ jobs:
fi

- name: Checkout repo
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
ref: ${{ env.SOURCE_BRANCH }}

Expand All @@ -77,7 +77,7 @@ jobs:
bash -c 'uv lock --upgrade'

- name: Upload lock file
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v6
with:
name: lock-file-${{ env.SOURCE_BRANCH }}
path: uv.lock
Expand All @@ -90,7 +90,7 @@ jobs:
TARGET_BRANCH: ${{ inputs.target-branch }}
steps:
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
token: ${{ secrets.PAT }}
ref: ${{ env.TARGET_BRANCH }}
Expand All @@ -103,7 +103,7 @@ jobs:
fi

- name: Download lock file
uses: actions/download-artifact@v4
uses: actions/download-artifact@v7
with:
name: lock-file-${{ env.SOURCE_BRANCH }}

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/auto-reminder-bot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ jobs:
if: github.repository == 'NVIDIA/Megatron-LM'
steps:
- name: Check out repository code
uses: actions/checkout@v4
uses: actions/checkout@v6

- name: Set up Python
uses: actions/setup-python@v5
uses: actions/setup-python@v6
with:
python-version: "3.10"

Expand Down
Loading
Loading