diff --git a/.github/actions/action.yml b/.github/actions/action.yml
index decaa5ff3f8..7e2faf936b4 100644
--- a/.github/actions/action.yml
+++ b/.github/actions/action.yml
@@ -59,7 +59,7 @@ runs:
run: echo "node_name=$NODE_NAME" | tee -a "$GITHUB_OUTPUT"
- name: Checkout repository
- uses: actions/checkout@v2
+ uses: actions/checkout@v6
- name: Change ownership of /home/runner/
shell: bash
@@ -98,7 +98,8 @@ runs:
--environment dev \
--platform dgx_h100 \
--tag ${{ inputs.tag }} \
- --container-image ${{ inputs.container-image }}
+ --container-image ${{ inputs.container-image }} \
+ --hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME
RUN_TEST_EOF
)
@@ -186,6 +187,7 @@ runs:
--platform dgx_h100 \
--container-image ${{ inputs.container-image }} \
--data-dir /mnt/datadrive/TestData/megatron-lm/artifacts \
+ --hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME
RUN_TEST_EOF
)
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
index fed1792d2f9..8c1ef48cf00 100644
--- a/.github/copy-pr-bot.yaml
+++ b/.github/copy-pr-bot.yaml
@@ -1,4 +1,4 @@
enabled: true
auto_sync_draft: false
auto_sync_ready: true
-trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]
+trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "ksivaman", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "nanz-nv", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "wplf", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]
diff --git a/.github/oncall_schedule.json b/.github/oncall_schedule.json
index 58fcf0ddbbc..fe90397c2ad 100644
--- a/.github/oncall_schedule.json
+++ b/.github/oncall_schedule.json
@@ -1,16 +1,4 @@
[
- {
- "user": "janEbert",
- "date": "2026-02-18"
- },
- {
- "user": "asolergi-nv",
- "date": "2026-02-25"
- },
- {
- "user": "BoxiangW",
- "date": "2026-03-04"
- },
{
"user": "maanug-nv",
"date": "2026-03-11"
@@ -20,31 +8,43 @@
"date": "2026-03-18"
},
{
- "user": "gautham-kollu",
+ "user": "janEbert",
"date": "2026-03-25"
},
{
- "user": "janEbert",
+ "user": "gautham-kollu",
"date": "2026-04-01"
},
{
- "user": "maanug-nv",
+ "user": "ilml",
"date": "2026-04-08"
},
{
- "user": "BoxiangW",
+ "user": "Phlip79",
"date": "2026-04-15"
},
{
- "user": "Phlip79",
+ "user": "asolergi-nv",
"date": "2026-04-22"
},
{
- "user": "asolergi-nv",
+ "user": "BoxiangW",
"date": "2026-04-29"
},
{
- "user": "dimapihtar",
+ "user": "maanug-nv",
"date": "2026-05-06"
+ },
+ {
+ "user": "dimapihtar",
+ "date": "2026-05-13"
+ },
+ {
+ "user": "gautham-kollu",
+ "date": "2026-05-20"
+ },
+ {
+ "user": "ilml",
+ "date": "2026-05-27"
}
]
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 5cd5138eb69..d2825f9c34b 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -5,19 +5,8 @@
## Contribution process
-```mermaid
-flowchart LR
- A[Pre-checks] --> B[PR Tests]
- subgraph Code Review/Approval
- C1[Expert Review] --> C2[Final Review]
- end
- B --> C1
- C2 --> D[Merge]
-```
-
### Pre-checks
-- [ ] I want this PR in a versioned release and have added the appropriate Milestone (e.g., `Core 0.8`)
- [ ] I have added relevant unit tests
- [ ] I have added relevant functional tests
- [ ] I have added proper typing to my code [Typing guidelines](https://docs.python.org/3/library/typing.html)
@@ -26,33 +15,32 @@ flowchart LR
### Code review
-The following process is enforced via the CODEOWNERS file for changes into `megatron/core`. For changes outside of `megatron/core`, it is up to the PR author whether or not to tag the Final Reviewer team.
+Feel free to message or comment the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) to help accelerate your merge into main. The less complex your PR is, the faster it will be approved and merged!
-
-For MRs into `main` branch
+All PRs start as **draft**. If you open a non-draft PR, it will be automatically converted to draft.
-Feel free to message or comment the @mcore-oncall to help accelerate your merge into main. The less complex your PR is, the faster it will be approved and merged!
+#### Step 1: Mark PR as "Ready for Review"
-#### (Step 1): Add PR label `Expert Review`
+1. When your PR is ready, click **Ready for Review**.
+2. An oncall reviewer is auto-assigned and expert reviewers are notified based on your changes.
+ - Some PRs may jump straight to step 2. This is determined by `.github/CODEOWNERS`.
-#### (Step 2): Collect the expert reviewers reviews
+:warning: Only mark as ready once merge-conflicts are resolved and the CI is passing.
+Final Review might get declined if these requirements are not fulfilled.
-1. Attach the `Expert Review` label when your PR is ready for review.
-2. GitHub auto-assigns expert reviewers based on your changes. They will get notified and pick up your PR soon.
+#### Step 2: Final Review
-:warning: Only proceed to the next step once all reviewers have approved, merge-conflict are resolved and the CI is passing.
-Final Review might get declined if these requirements are not fulfilled.
+For PRs that change `megatron/core`, once all expert reviewers have approved, the `Final Review` label is applied **automatically** and final reviewers are assigned.
-#### (Step 3): Final Review
+For PRs outside `megatron/core`, this step is skipped.
-1. Add `Final Review` label
-2. GitHub auto-assigns final reviewers based on your changes. They will get notified and pick up your PR soon.
+#### Step 3: Approved
-#### (Optional Step 4): Cherry-pick into release branch
+Once all required reviewers have approved, the `Approved` label is applied **automatically**.
-If this PR also needs to be merged into `core_r*` release branches, after this PR has been merged, select `Cherry-pick` to open a new PR into the release branch.
+### Merge
-
+Any member of [mcore-engineers](https://github.com/orgs/NVIDIA/teams/mcore-engineers) will be able to merge your PR.
For MRs into `dev` branch
@@ -60,7 +48,3 @@ The proposed review process for `dev` branch is under active discussion.
MRs are mergable after one approval by either `eharper@nvidia.com` or `zijiey@nvidia.com`.
-
-### Merging your PR
-
-Any member of [core-adlr](https://github.com/orgs/teams/NVIDIA/core-adlr) and [`core-nemo`](https://github.com/orgs/teams/NVIDIA/core-nemo) will be able to merge your PR.
diff --git a/.github/workflows/_build_test_publish_wheel.yml b/.github/workflows/_build_test_publish_wheel.yml
index 0b71577b587..f29488c4494 100644
--- a/.github/workflows/_build_test_publish_wheel.yml
+++ b/.github/workflows/_build_test_publish_wheel.yml
@@ -43,7 +43,7 @@ jobs:
PUBLISH_DRYRUN: ${{ inputs.dry-run }}
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
ref: ${{ inputs.ref }}
@@ -136,7 +136,7 @@ jobs:
test "${{ steps.build-wheel.outputs.expected-release-number }}" == "$RELEASE_NUMBER"
- name: Upload wheels
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
path: dist/
@@ -159,7 +159,7 @@ jobs:
PACKAGE: ${{ matrix.PACKAGE }}
steps:
- name: Download wheels
- uses: actions/download-artifact@v4
+ uses: actions/download-artifact@v7
with:
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
path: dist/
diff --git a/.github/workflows/_release_library.yml b/.github/workflows/_release_library.yml
index 684dacc27aa..46de1bb3a66 100644
--- a/.github/workflows/_release_library.yml
+++ b/.github/workflows/_release_library.yml
@@ -53,6 +53,11 @@ on:
description: Starting tag for changelog builder (leave empty for auto-detect)
type: string
default: ""
+ publish-docs:
+ required: false
+ description: Publish documentation to S3 after release
+ type: boolean
+ default: true
secrets:
TWINE_PASSWORD:
required: true
@@ -60,6 +65,22 @@ on:
required: true
PAT:
required: true
+ AWS_ASSUME_ROLE_ARN:
+ required: true
+ AWS_ACCESS_KEY_ID:
+ required: true
+ AWS_SECRET_ACCESS_KEY:
+ required: true
+ AKAMAI_HOST:
+ required: true
+ AKAMAI_CLIENT_TOKEN:
+ required: true
+ AKAMAI_CLIENT_SECRET:
+ required: true
+ AKAMAI_ACCESS_TOKEN:
+ required: true
+ S3_BUCKET_NAME:
+ required: true
permissions:
contents: write # To read repository content
@@ -89,7 +110,7 @@ jobs:
IS_DRY_RUN: ${{ inputs.dry-run }}
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
token: ${{ secrets.PAT }}
@@ -199,11 +220,8 @@ jobs:
# Extract PR number from URL
PR_NUMBER=$(echo $PR_URL | grep -o '[0-9]*$')
- # Add comment to the newly created PR
- echo gh pr comment $PR_NUMBER --body "/ok to test $(git rev-parse HEAD)"
-
- name: Wait for status checks on tmp branch
- uses: actions/github-script@v7
+ uses: actions/github-script@v8
id: wait-status
with:
github-token: ${{ secrets.PAT }}
@@ -326,7 +344,6 @@ jobs:
ref: ${{ inputs.release-ref }}
no-publish: false
secrets:
- TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
create-gh-release:
@@ -344,10 +361,10 @@ jobs:
REPOSITORY: ${{ github.repository }}
PROJECT_NAME: Megatron Core
VERSION: ${{ needs.bump-next-version.outputs.release-version }}
- TAG_PREFIX: ${{ inputs.gh-release-tag-prefix || '' }}
+ TAG_PREFIX: core_
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
ref: ${{ inputs.release-ref }}
@@ -455,6 +472,12 @@ jobs:
publish-docs:
needs: [bump-next-version, create-gh-release]
uses: ./.github/workflows/release-docs.yml
+ if: |
+ (
+ success() || !failure()
+ )
+ && inputs.publish-docs == true
+ && !cancelled()
with:
dry-run: ${{ inputs.dry-run }}
publish-as-latest: true
@@ -472,7 +495,7 @@ jobs:
VERSION: ${{ needs.build-test-publish-wheels.outputs.version }}
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
repository: NVIDIA-NeMo/FW-CI-templates
ref: v0.17.0
diff --git a/.github/workflows/_update_dependencies.yml b/.github/workflows/_update_dependencies.yml
index a60e69f701b..903d773edbd 100644
--- a/.github/workflows/_update_dependencies.yml
+++ b/.github/workflows/_update_dependencies.yml
@@ -33,7 +33,7 @@ jobs:
TARGET_BRANCH: ${{ inputs.target-branch }}
steps:
- name: Checkout repo
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
ref: ${{ env.TARGET_BRANCH }}
@@ -60,7 +60,7 @@ jobs:
fi
- name: Checkout repo
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
ref: ${{ env.SOURCE_BRANCH }}
@@ -77,7 +77,7 @@ jobs:
bash -c 'uv lock --upgrade'
- name: Upload lock file
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
name: lock-file-${{ env.SOURCE_BRANCH }}
path: uv.lock
@@ -90,7 +90,7 @@ jobs:
TARGET_BRANCH: ${{ inputs.target-branch }}
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
token: ${{ secrets.PAT }}
ref: ${{ env.TARGET_BRANCH }}
@@ -103,7 +103,7 @@ jobs:
fi
- name: Download lock file
- uses: actions/download-artifact@v4
+ uses: actions/download-artifact@v7
with:
name: lock-file-${{ env.SOURCE_BRANCH }}
diff --git a/.github/workflows/auto-reminder-bot.yml b/.github/workflows/auto-reminder-bot.yml
index 37e6e5498e3..72a48e9539e 100644
--- a/.github/workflows/auto-reminder-bot.yml
+++ b/.github/workflows/auto-reminder-bot.yml
@@ -14,10 +14,10 @@ jobs:
if: github.repository == 'NVIDIA/Megatron-LM'
steps:
- name: Check out repository code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: "3.10"
diff --git a/.github/workflows/auto-swap-labels.yml b/.github/workflows/auto-swap-labels.yml
index 5335026e2af..f1dd9757c8a 100644
--- a/.github/workflows/auto-swap-labels.yml
+++ b/.github/workflows/auto-swap-labels.yml
@@ -2,32 +2,74 @@
name: Auto Swap Labels
on:
- pull_request_review:
- types: [submitted]
+ pull_request_target:
+ types: [ready_for_review, synchronize]
+ branches:
+ - main
+ workflow_run:
+ workflows: ["Review Trigger"]
+ types: [completed]
permissions:
pull-requests: write
contents: read
+ actions: read
jobs:
check-approval:
runs-on: ubuntu-latest
- if: github.event.review.state == 'approved' && github.repository == 'NVIDIA/Megatron-LM'
+ if: >-
+ github.repository == 'NVIDIA/Megatron-LM' && (
+ (github.event_name == 'pull_request_target' &&
+ github.event.pull_request.base.ref == 'main' &&
+ !github.event.pull_request.draft) ||
+ (github.event_name == 'workflow_run' &&
+ github.event.workflow_run.conclusion == 'success')
+ )
+
steps:
+ - name: Get PR number from workflow_run
+ id: get-pr
+ if: github.event_name == 'workflow_run'
+ continue-on-error: true
+ uses: actions/download-artifact@v7
+ with:
+ name: pr-number
+ path: pr-number
+ github-token: ${{ github.token }}
+ run-id: ${{ github.event.workflow_run.id }}
+
+ - name: Set PR number
+ id: pr
+ run: |
+ if [ "${{ github.event_name }}" = "workflow_run" ]; then
+ if [ "${{ steps.get-pr.outcome }}" != "success" ]; then
+ echo "No approval artifact found — review was not an approval. Skipping."
+ exit 0
+ fi
+ echo "number=$(cat pr-number/number)" >> $GITHUB_OUTPUT
+ else
+ echo "number=${{ github.event.pull_request.number }}" >> $GITHUB_OUTPUT
+ fi
+
- name: Check out repository code
+ if: steps.pr.outputs.number
uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v5
+ if: steps.pr.outputs.number
+ uses: actions/setup-python@v6
with:
python-version: "3.10"
- name: Install dependencies
+ if: steps.pr.outputs.number
run: |
pip install --no-cache-dir PyGithub slack-sdk
- - name: Run Auto Reminder Bot
+ - name: Run Auto Swap Labels
+ if: steps.pr.outputs.number
run: |
- export GH_TOKEN=${{ github.token }}
- export PR_NUMBER=${{ github.event.pull_request.number }}
+ export GH_TOKEN=${{ secrets.PAT }}
+ export PR_NUMBER=${{ steps.pr.outputs.number }}
python tests/test_utils/python_scripts/swap_pr_labels.py
diff --git a/.github/workflows/auto-update-copy-pr-bot.yml b/.github/workflows/auto-update-copy-pr-bot.yml
index 3358a747f34..07fdcfbfbb8 100644
--- a/.github/workflows/auto-update-copy-pr-bot.yml
+++ b/.github/workflows/auto-update-copy-pr-bot.yml
@@ -11,7 +11,7 @@ jobs:
if: github.repository == 'NVIDIA/Megatron-LM'
steps:
- name: Checkout code
- uses: actions/checkout@v3
+ uses: actions/checkout@v6
with:
token: ${{ secrets.PAT }}
ref: main
diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml
new file mode 100644
index 00000000000..1bc7dfcb6e4
--- /dev/null
+++ b/.github/workflows/build-docs.yml
@@ -0,0 +1,65 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Build docs
+
+on:
+ push:
+ branches:
+ - main
+ - "pull-request/[0-9]+"
+ - "deploy-release/*"
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
+ cancel-in-progress: true
+
+jobs:
+ pre-flight:
+ uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
+
+ build-docs:
+ needs: [pre-flight]
+ if: needs.pre-flight.outputs.is_deployment_workflow != 'true'
+ uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.57.0
+
+ build-docs-summary:
+ needs: [pre-flight, build-docs]
+ if: |
+ (
+ needs.pre-flight.outputs.is_deployment_workflow == 'true'
+ || always()
+ )
+ && !cancelled()
+ runs-on: ubuntu-latest
+ steps:
+ - name: Get workflow result
+ id: result
+ shell: bash -x -e -u -o pipefail {0}
+ env:
+ GH_TOKEN: ${{ github.token }}
+ RUN_ID: ${{ github.run_id }}
+ SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
+ run: |
+ FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || FAILED_JOBS=0
+
+ if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
+ echo "✅ All previous jobs completed successfully"
+ exit 0
+ else
+ echo "❌ Found $FAILED_JOBS failed job(s)"
+ # Show which jobs failed
+ gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
+ exit 1
+ fi
diff --git a/.github/workflows/build-test-publish-wheel.yml b/.github/workflows/build-test-publish-wheel.yml
index 00711b50806..88b9e8b8c61 100644
--- a/.github/workflows/build-test-publish-wheel.yml
+++ b/.github/workflows/build-test-publish-wheel.yml
@@ -58,7 +58,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Result
env:
diff --git a/.github/workflows/cicd-approve-test-queue.yml b/.github/workflows/cicd-approve-test-queue.yml
index 2cba41eafb8..02bfbbfd3a5 100644
--- a/.github/workflows/cicd-approve-test-queue.yml
+++ b/.github/workflows/cicd-approve-test-queue.yml
@@ -27,12 +27,13 @@ jobs:
strategy:
matrix:
branch: [main, dev, others]
+ contributor_type: [internal, external]
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: "3.12"
@@ -41,23 +42,44 @@ jobs:
python -m pip install --upgrade pip
pip install requests
+ - name: Download SSO users list
+ run: |
+ gh release download v0.1.0 \
+ --repo NVIDIA-GitHub-Management/github-audits \
+ --pattern users_sso.json \
+ --output users_sso.json || echo '{}' > users_sso.json
+ env:
+ GH_TOKEN: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
+
- name: Approve waiting deployments
env:
GITHUB_TOKEN: ${{ secrets.PAT }}
MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
+ MAX_CONCURRENCY_EXTERNAL: ${{ vars.MAX_CONCURRENCY_EXTERNAL || 3 }}
+ CONTRIBUTOR_TYPE: ${{ matrix.contributor_type }}
+ SSO_USERS_FILE: users_sso.json
PYTHONUNBUFFERED: 1
shell: python
run: |
import os
+ import json
import requests
import re
# GitHub API configuration
GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
REPO = os.environ["GITHUB_REPOSITORY"]
- MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"]) // 2
+ CONTRIBUTOR_TYPE = os.environ["CONTRIBUTOR_TYPE"]
+ if CONTRIBUTOR_TYPE == "external":
+ MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY_EXTERNAL"]) // 2
+ else:
+ MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"]) // 2
API_BASE = f"https://api.github.com/repos/NVIDIA/Megatron-LM"
+ # Load SSO users for internal/external classification
+ with open(os.environ["SSO_USERS_FILE"]) as f:
+ sso_users = json.load(f)
+
# Headers for GitHub API
headers = {
"Authorization": f"token {GITHUB_TOKEN}",
@@ -81,53 +103,76 @@ jobs:
print(f"Response: {e.response.text}")
return None
- def is_pr_targeting_branch(workflow_run, target_branch):
+ def is_internal_contributor(pr_info):
+ """Return True if the PR author is a member of NVIDIA or NVIDIA-NeMo org (is_org_member)."""
+ login = pr_info.get("user", {}).get("login", "")
+ org_roles = sso_users.get(login, {}).get("org_roles", [])
+ return any(role in ("NVIDIA:Member", "NVIDIA-NeMo:Member") for role in org_roles)
+
+ def get_pr_base_branch(workflow_run):
"""
- Check if a workflow run belongs to a PR targeting the given branch.
- Extract PR number from head branch like 'pull-request/1913' and verify base branch.
+ Return the base branch of the PR associated with a workflow run, or None.
+ Extracts PR number from head branch like 'pull-request/1913' and fetches PR info.
+ Returns (base_branch, pr_info) tuple, or (None, None) if not a PR run.
"""
print(workflow_run.get("head_branch", ""))
head_branch = workflow_run.get("head_branch", "")
match = re.match(r"pull-request/(\d+)", head_branch)
if not match:
- return False # Not a PR branch pattern
+ return None, None # Not a PR branch pattern
pr_number = int(match.group(1))
-
+
# Fetch PR info from GitHub API
pr_info = make_request(f"pulls/{pr_number}")
if not pr_info:
print(f"Failed to fetch PR #{pr_number}")
- return False
+ return None, None
base_branch = pr_info.get("base", {}).get("ref")
- if (
- (base_branch == target_branch) or
- (base_branch != "main" and base_branch != "dev" and target_branch == "others")
- ):
- print(f"PR #{pr_number} targets {target_branch}")
- return True
+ return base_branch, pr_info
+
+ def matches_queue(workflow_run, target_branch, contributor_type):
+ """
+ Return True if the workflow run belongs to this queue cell:
+ matching target branch AND matching contributor type (internal/external).
+ """
+ base_branch, pr_info = get_pr_base_branch(workflow_run)
+ if base_branch is None:
+ return False
+
+ branch_match = (
+ (base_branch == target_branch) or
+ (base_branch != "main" and base_branch != "dev" and target_branch == "others")
+ )
+ if not branch_match:
+ return False
- return False
+ pr_number = re.match(r"pull-request/(\d+)", workflow_run.get("head_branch", "")).group(1)
+ internal = is_internal_contributor(pr_info)
+ contributor_match = (contributor_type == "internal") == internal
+ if branch_match and contributor_match:
+ print(f"PR #{pr_number} targets {target_branch}, contributor_type={contributor_type} (internal={internal})")
+ return branch_match and contributor_match
# Get current running and queued workflows
print("Fetching workflow runs...")
queued_workflow_runs = make_request("actions/runs?status=queued").get("workflow_runs", [])
in_progress_workflow_runs = make_request("actions/runs?status=in_progress").get("workflow_runs", [])
- # Filter for workflows belonging to PRs targeting ${{ matrix.branch }}
- queued_workflow_runs = [run for run in queued_workflow_runs
- if run["name"] == "CICD Megatron-LM" and is_pr_targeting_branch(run, "${{ matrix.branch }}")]
- in_progress_workflow_runs = [run for run in in_progress_workflow_runs
- if run["name"] == "CICD Megatron-LM" and is_pr_targeting_branch(run, "${{ matrix.branch }}")]
+ # Filter for workflows belonging to PRs targeting ${{ matrix.branch }} with matching contributor type
+ queued_workflow_runs = [run for run in queued_workflow_runs
+ if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)]
+ in_progress_workflow_runs = [run for run in in_progress_workflow_runs
+ if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)]
# Count running and queued workflows
queued_workflows = len(queued_workflow_runs)
in_progress_workflows = len(in_progress_workflow_runs)
total_workflows = queued_workflows + in_progress_workflows
- print(f"Current queued workflows (PRs targeting ${{ matrix.branch }}): {queued_workflows}")
- print(f"Current running workflows (PRs targeting ${{ matrix.branch }}): {in_progress_workflows}")
+ print(f"Current queued workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {queued_workflows}")
+ print(f"Current running workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {in_progress_workflows}")
print(f"Total workflows: {total_workflows}")
print(f"Max concurrency: {MAX_CONCURRENCY}")
@@ -139,8 +184,8 @@ jobs:
print("Fetching deployments...")
pending_workflows = make_request("actions/runs?status=waiting").get("workflow_runs", [])
print("Pending workflows:", len(pending_workflows))
- pending_workflows = [run for run in pending_workflows
- if run["name"] == "CICD Megatron-LM" and is_pr_targeting_branch(run, "${{ matrix.branch }}")]
+ pending_workflows = [run for run in pending_workflows
+ if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)]
# Sort deployments by creation date (oldest first)
print("Sorting workflows...")
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 2cc025baf99..f63011df63f 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -25,7 +25,7 @@ on:
workflow_dispatch:
concurrency:
- group: ${{ github.workflow }}-${{ github.head_ref || github.ref || github.event.pull_request.number }}
+ group: ${{ github.workflow }}-${{ github.head_ref || github.event.merge_group.head_ref || github.ref }}
cancel-in-progress: true
permissions:
@@ -52,7 +52,7 @@ jobs:
DISABLE_EXTERNAL_CONTRIBUTOR: ${{ vars.DISABLE_EXTERNAL_CONTRIBUTOR }}
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
token: ${{ env.GITHUB_TOKEN }}
@@ -145,7 +145,7 @@ jobs:
)
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -208,7 +208,7 @@ jobs:
mbridge-test-suite: ${{ steps.select-mbridge-test-suite.outputs.main }}
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Get PR info
id: get-pr-info
@@ -251,7 +251,7 @@ jobs:
uses: nv-gha-runners/get-pr-info@main
- name: Checkout MBridge and create testing branch
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
ref: main
repository: NVIDIA-NeMo/Megatron-Bridge
@@ -347,12 +347,12 @@ jobs:
echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT"
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
ref: ${{ steps.sha.outputs.main }}
- name: Setup python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: 3.12
@@ -470,7 +470,7 @@ jobs:
&& !cancelled()
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Parse unit tests
id: parse-unit-tests
run: |
@@ -510,7 +510,7 @@ jobs:
PIP_ROOT_USER_ACTION: ignore
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: main
uses: ./.github/actions
with:
@@ -545,7 +545,7 @@ jobs:
integration-tests: ${{ steps.main.outputs.integration-tests }}
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Get PR info
id: get-pr-info
@@ -649,7 +649,7 @@ jobs:
&& !cancelled()
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: main
uses: ./.github/actions
with:
@@ -682,7 +682,7 @@ jobs:
permissions: write-all
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Get workflow result
id: result
@@ -719,7 +719,7 @@ jobs:
&& github.repository == 'NVIDIA/Megatron-LM'
steps:
- name: Generate fake coverage report
- uses: actions/github-script@v6
+ uses: actions/github-script@v8
with:
github-token: ${{ secrets.PAT }}
script: |
@@ -747,10 +747,10 @@ jobs:
flag: [unit-test]
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Download coverage reports of current branch
- uses: actions/download-artifact@v4
+ uses: actions/download-artifact@v7
with:
pattern: coverage-${{ matrix.flag }}-*
@@ -778,7 +778,7 @@ jobs:
flags: ${{ matrix.flag }}
- name: Upload artifacts
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
name: coverage-${{ matrix.flag }}-aggregated
path: |
@@ -799,7 +799,7 @@ jobs:
echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
- name: Comment on PR with action run URL
- uses: actions/github-script@v7
+ uses: actions/github-script@v8
with:
github-token: ${{ secrets.PAT }}
script: |
@@ -826,6 +826,7 @@ jobs:
always()
&& !cancelled()
&& contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral')
+ && needs.pre-flight.outputs.is_deployment_workflow != 'true'
steps:
- name: Taint node for cleanup
shell: bash
diff --git a/.github/workflows/claude-complexity-label.yml b/.github/workflows/claude-complexity-label.yml
new file mode 100644
index 00000000000..356eed2da29
--- /dev/null
+++ b/.github/workflows/claude-complexity-label.yml
@@ -0,0 +1,60 @@
+name: Claude Complexity Label
+
+on:
+ pull_request_target:
+ types: [ready_for_review]
+
+jobs:
+ label-complexity:
+ name: Label PR Complexity
+ runs-on: ubuntu-latest
+ permissions:
+ contents: read
+ pull-requests: write
+ issues: write
+ id-token: write
+ env:
+ GH_TOKEN: ${{ secrets.PAT }}
+ REPO: ${{ github.repository }}
+ PR_NUMBER: ${{ github.event.pull_request.number }}
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v6
+ with:
+ fetch-depth: 0
+
+ - name: Run Claude Complexity Analysis
+ uses: anthropics/claude-code-action@v1
+ with:
+ anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
+ github_token: ${{ secrets.PAT }}
+ prompt: |
+ REPO: ${{ env.REPO }}
+ PR NUMBER: ${{ env.PR_NUMBER }}
+
+ You are a PR complexity analyzer. Your job is to analyze the diff of this PR and apply exactly one complexity label.
+
+ STEPS:
+ 1. Get the PR diff by running: gh pr diff $PR_NUMBER --repo $REPO
+ 2. Analyze every changed line (added or removed) in the diff and classify each as one of:
+ - "docs-only": changes to docstrings, comments (lines starting with # or //), documentation files (.md, .rst, .txt), or similar non-functional text
+ - "test": changes in test files (files with "test" in the name/path, or inside a tests/ directory)
+ - "real code": all other changes (functional source code)
+ 3. Compute "real code line changes" using this formula:
+ real_code_line_changes = (number of real code lines changed) + (number of test lines changed / 10)
+ Count both added and removed lines. Do not count unchanged context lines. Do not count comments or docstrings.
+ 4. Remove any previously applied complexity or docs-only labels:
+ gh pr edit $PR_NUMBER --repo $REPO --remove-label "complexity: low,complexity: medium,complexity: high,docs-only"
+ 5. Apply exactly ONE label using the gh CLI:
+ - If there are ZERO real code lines and ZERO test lines (only docs-only changes), apply label "docs-only":
+ gh pr edit $PR_NUMBER --repo $REPO --add-label "docs-only"
+ - If real_code_line_changes < 100, apply label "complexity: low":
+ gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: low"
+ - If real_code_line_changes >= 100 and < 500, apply label "complexity: medium":
+ gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: medium"
+ - If real_code_line_changes >= 500, apply label "complexity: high":
+ gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: high"
+
+ Do NOT post any comments on the PR. Only apply the label.
+ claude_args: |
+ --allowedTools "Bash(gh pr diff:*),Bash(gh pr edit:*),Bash(gh pr view:*)"
diff --git a/.github/workflows/claude_review.yml b/.github/workflows/claude_review.yml
new file mode 100644
index 00000000000..2a159c07d20
--- /dev/null
+++ b/.github/workflows/claude_review.yml
@@ -0,0 +1,67 @@
+name: Claude Code Review
+
+on:
+ issue_comment:
+ types: [created]
+
+jobs:
+ review-on-comment:
+ name: Claude Review (comment trigger)
+ if: |
+ github.event_name == 'issue_comment' &&
+ github.event.issue.pull_request &&
+ contains(github.event.comment.body, '/claude review')
+ runs-on: ubuntu-latest
+ permissions:
+ contents: read
+ pull-requests: write
+ issues: write
+ id-token: write
+ env:
+ GH_TOKEN: ${{ github.token }}
+ REPO: ${{ github.repository }}
+ PR_NUMBER: ${{ github.event.issue.number }}
+ steps:
+ - name: Get PR head commit
+ id: get-pr-head-commit
+ run: |
+ echo "sha=$(gh pr view $PR_NUMBER --repo $REPO --json headRefOid -q .headRefOid)" | tee -a $GITHUB_OUTPUT
+
+ - name: Checkout repository
+ uses: actions/checkout@v6
+ with:
+ fetch-depth: 1
+ ref: ${{ steps.get-pr-head-commit.outputs.sha }}
+
+ - name: Run Claude Code Review
+ uses: anthropics/claude-code-action@v1
+ with:
+ anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
+ trigger_phrase: "/claude review"
+ show_full_output: true
+ claude_args: |
+ --allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr review:*)"
+ --model "claude-opus-4-6"
+ prompt: |
+ REPO: ${{ env.REPO }}
+ PR NUMBER: ${{ env.PR_NUMBER }}
+
+ You are doing a light code review. Keep it concise and actionable.
+
+ Focus ONLY on:
+ - Critical bugs or logic errors
+ - Typos in code, comments, or strings
+ - Missing or insufficient test coverage for changed code
+ - Outdated or inaccurate documentation affected by the changes
+
+ Do NOT comment on:
+ - Style preferences or formatting
+ - Minor naming suggestions
+ - Architectural opinions or refactoring ideas
+ - Performance unless there is a clear, measurable issue
+
+ Provide feedback using inline comments for specific code suggestions.
+ Use top-level comments for general observations.
+
+ It's perfectly acceptable to not have anything to comment on.
+ If you do not have anything to comment on, post "LGTM".
diff --git a/.github/workflows/config/changelog-config.json b/.github/workflows/config/changelog-config.json
index e640b90a0f3..19fb0e42364 100644
--- a/.github/workflows/config/changelog-config.json
+++ b/.github/workflows/config/changelog-config.json
@@ -15,7 +15,7 @@
},
"transformers": [],
"max_tags_to_fetch": 100,
- "max_pull_requests": 500,
+ "max_pull_requests": 1250,
"max_back_track_time_days": 365,
"exclude_merge_branches": [],
"tag_resolver": {
diff --git a/.github/workflows/copyright-check.yml b/.github/workflows/copyright-check.yml
index a7f51cd8a0e..33d30944f8d 100644
--- a/.github/workflows/copyright-check.yml
+++ b/.github/workflows/copyright-check.yml
@@ -49,7 +49,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Result
env:
diff --git a/.github/workflows/force-draft-pr.yml b/.github/workflows/force-draft-pr.yml
new file mode 100644
index 00000000000..d45dabf14b7
--- /dev/null
+++ b/.github/workflows/force-draft-pr.yml
@@ -0,0 +1,36 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+name: Force Draft PR
+
+on:
+ pull_request_target:
+ types: [opened]
+ branches:
+ - main
+
+permissions:
+ pull-requests: write
+
+jobs:
+ force-draft:
+ runs-on: ubuntu-latest
+ if: ${{ !github.event.pull_request.draft && github.repository == 'NVIDIA/Megatron-LM' }}
+ steps:
+ - name: Convert PR to draft
+ env:
+ GH_TOKEN: ${{ secrets.PAT }}
+ run: |
+ gh pr ready --undo ${{ github.event.pull_request.number }} --repo ${{ github.repository }}
+
+ - name: Add comment explaining draft policy
+ env:
+ GH_TOKEN: ${{ github.token }}
+ run: |
+ gh pr comment ${{ github.event.pull_request.number }} --repo ${{ github.repository }} --body \
+ "This PR has been automatically converted to **draft** because all PRs must start as drafts.
+
+ When you are ready for review, click **Ready for Review** to begin the review process. This will:
+ 1. Add the oncall reviewer (optional reviewer)
+ 2. Add required review teams based on your changes
+
+ See the [contribution guide](https://github.com/NVIDIA/Megatron-LM/blob/main/docs/developer/submit.md) for more details."
diff --git a/.github/workflows/install-test.yml b/.github/workflows/install-test.yml
index 5a0abb8596d..060e1c5ade0 100644
--- a/.github/workflows/install-test.yml
+++ b/.github/workflows/install-test.yml
@@ -49,7 +49,7 @@ jobs:
python-version: ["3.12"]
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set PATH
run: |
@@ -65,7 +65,7 @@ jobs:
run: bash docker/common/install.sh --environment dev --base-image pytorch --python-version ${{ matrix.python-version }}
- name: Checkout check-imports
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
repository: NVIDIA-NeMo/FW-CI-templates
ref: v0.63.2
@@ -94,7 +94,7 @@ jobs:
python-version: ["3.12"]
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set PATH
run: |
@@ -113,7 +113,7 @@ jobs:
# NGC PyTorch 25.05 has a version of triton that is broken on CPU only machines.
# - name: Checkout check-imports
- # uses: actions/checkout@v4
+ # uses: actions/checkout@v6
# with:
# repository: NVIDIA-NeMo/FW-CI-templates
# ref: v0.63.2
@@ -139,7 +139,7 @@ jobs:
&& github.repository == 'NVIDIA/Megatron-LM'
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Get workflow result
id: result
diff --git a/.github/workflows/oncall-assign.yml b/.github/workflows/oncall-assign.yml
index d4cc47d5f9e..6da0776ffc2 100644
--- a/.github/workflows/oncall-assign.yml
+++ b/.github/workflows/oncall-assign.yml
@@ -16,7 +16,7 @@ name: Oncall Assign
on:
pull_request_target:
- types: [opened, ready_for_review]
+ types: [ready_for_review]
branches:
- main
@@ -30,10 +30,10 @@ jobs:
if: ${{ !github.event.pull_request.draft }}
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v6
with:
python-version: '3.10'
diff --git a/.github/workflows/oncall-rotation.yml b/.github/workflows/oncall-rotation.yml
index a621be7f652..0d5f774e441 100644
--- a/.github/workflows/oncall-rotation.yml
+++ b/.github/workflows/oncall-rotation.yml
@@ -28,12 +28,12 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
token: ${{ secrets.PAT }}
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v6
with:
python-version: "3.10"
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 647e6af2379..a756d49eb20 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -44,6 +44,11 @@ on:
description: Branch for version bump
required: true
type: string
+ gh-release-from-tag:
+ description: Tag of previous release for changelog builder
+ required: false
+ type: string
+ default: ""
permissions:
contents: write # To read repository content
@@ -59,7 +64,16 @@ jobs:
create-gh-release: ${{ inputs.create-gh-release || true }}
gh-release-use-changelog-builder: ${{ inputs.generate-changelog }}
publish-docs: ${{ inputs.publish-docs }}
+ gh-release-from-tag: ${{ inputs.gh-release-from-tag }}
secrets:
TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }}
SLACK_WEBHOOK: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SLACK_MAIN_CHANNEL_WEBHOOK || secrets.SLACK_CI_CHANNEL_WEBHOOK }}
PAT: ${{ secrets.PAT }}
+ AWS_ASSUME_ROLE_ARN: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
+ AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ AKAMAI_HOST: ${{ secrets.AKAMAI_HOST }}
+ AKAMAI_CLIENT_TOKEN: ${{ secrets.AKAMAI_CLIENT_TOKEN }}
+ AKAMAI_CLIENT_SECRET: ${{ secrets.AKAMAI_CLIENT_SECRET }}
+ AKAMAI_ACCESS_TOKEN: ${{ secrets.AKAMAI_ACCESS_TOKEN }}
+ S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }}
diff --git a/.github/workflows/review-trigger.yml b/.github/workflows/review-trigger.yml
new file mode 100644
index 00000000000..28abf259882
--- /dev/null
+++ b/.github/workflows/review-trigger.yml
@@ -0,0 +1,28 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Lightweight workflow that triggers on review approval; otherwise there is no access to the right secrets.
+# No secrets needed — just signals auto-swap-labels.yml via workflow_run.
+
+name: Review Trigger
+
+on:
+ pull_request_review:
+ types: [submitted]
+
+jobs:
+ signal:
+ runs-on: ubuntu-latest
+ if: >-
+ github.event.review.state == 'approved' &&
+ github.event.pull_request.base.ref == 'main' &&
+ github.repository == 'NVIDIA/Megatron-LM'
+ steps:
+ - name: Save PR number
+ run: |
+ mkdir -p pr
+ echo "${{ github.event.pull_request.number }}" > pr/number
+ - name: Upload PR number
+ uses: actions/upload-artifact@v4
+ with:
+ name: pr-number
+ path: pr/
diff --git a/.github/workflows/sync-team-usergroups.yml b/.github/workflows/sync-team-usergroups.yml
index fb48a6ca5d4..7f32ac55c57 100644
--- a/.github/workflows/sync-team-usergroups.yml
+++ b/.github/workflows/sync-team-usergroups.yml
@@ -24,10 +24,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v6
with:
python-version: "3.10"
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a238f2c9999..2eb1b43be0c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,16 +1,16 @@
.merge_train_rule: &merge_train_rule
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'no'
+ INTEGRATION_TEST: "no"
INTEGRATION_TEST_SCOPE: mr
- FUNCTIONAL_TEST: 'yes'
+ FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr-slim
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_TIME_LIMIT: 2700
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
workflow:
rules:
@@ -29,36 +29,42 @@ workflow:
auto_cancel:
on_new_commit: none
- # For manual pipelines
+ # For manual pipelines (GitLab UI)
- if: $CI_PIPELINE_SOURCE == "web"
+ # For pipelines created via the REST API (personal access token)
+ - if: $CI_PIPELINE_SOURCE == "api"
+
+ # For trigger pipelines
+ - if: $CI_PIPELINE_SOURCE == "trigger"
+
# For push to main
- if: $CI_PIPELINE_SOURCE == 'push' && ($CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "dev" || $CI_COMMIT_BRANCH =~ /^core_/)
variables:
- UNIT_TEST: 'no'
- INTEGRATION_TEST: 'no'
- FUNCTIONAL_TEST: 'yes'
+ UNIT_TEST: "no"
+ INTEGRATION_TEST: "no"
+ FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_REPEAT: 5
- FUNCTIONAL_TEST_RECORD_CHECKPOINTS: 'no'
+ FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 3600
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
auto_cancel:
on_new_commit: interruptible
# For merge-trains that need to be fast-tracked
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' && $CI_MERGE_REQUEST_LABELS =~ /fast-track/
variables:
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'no'
- FUNCTIONAL_TEST: 'no'
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ INTEGRATION_TEST: "no"
+ FUNCTIONAL_TEST: "no"
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
# For normal merge-trains
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train'
@@ -67,75 +73,75 @@ workflow:
# For MRs with integration suite
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run tests/
variables:
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'yes'
+ INTEGRATION_TEST: "yes"
INTEGRATION_TEST_SCOPE: mr
- FUNCTIONAL_TEST: 'no'
+ FUNCTIONAL_TEST: "no"
FUNCTIONAL_TEST_SCOPE: mr-slim
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_TIME_LIMIT: 2700
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
# For MRs with nightly
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run nightly/
variables:
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'no'
- FUNCTIONAL_TEST: 'yes'
+ INTEGRATION_TEST: "no"
+ FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: nightly
FUNCTIONAL_TEST_REPEAT: 5
- FUNCTIONAL_TEST_RECORD_CHECKPOINTS: 'no'
+ FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 2700
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
# For MRs with weekly
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run weekly/
variables:
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'no'
- FUNCTIONAL_TEST: 'yes'
+ INTEGRATION_TEST: "no"
+ FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: weekly
FUNCTIONAL_TEST_REPEAT: 1
- FUNCTIONAL_TEST_RECORD_CHECKPOINTS: 'no'
+ FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 9000
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
# For MRs with heavy suite
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run functional tests/
variables:
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'no'
- FUNCTIONAL_TEST: 'yes'
+ INTEGRATION_TEST: "no"
+ FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_TIME_LIMIT: 2700
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
# Default MRs
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result'
variables:
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'no'
- FUNCTIONAL_TEST: 'no'
- PUBLISH: 'no'
+ INTEGRATION_TEST: "no"
+ FUNCTIONAL_TEST: "no"
+ PUBLISH: "no"
- when: never
@@ -157,109 +163,109 @@ default:
variables:
BUILD:
- value: 'yes'
+ value: "yes"
UNIT_TEST:
- value: 'yes'
+ value: "yes"
options:
- - 'yes'
- - 'no'
+ - "yes"
+ - "no"
description: To run the funtional test suite
UNIT_TEST_REPEAT:
- value: '1'
- description: 'Number of repetitions'
+ value: "1"
+ description: "Number of repetitions"
UNIT_TEST_TIMEOUT:
- value: '30'
+ value: "30"
description: Timeout (minutes) for Unit tests (all repeats)
INTEGRATION_TEST:
- value: 'yes'
+ value: "yes"
options:
- - 'yes'
- - 'no'
+ - "yes"
+ - "no"
description: To run the integration test suite
INTEGRATION_TEST_SCOPE:
- value: 'mr'
+ value: "mr"
options:
- - 'mr'
- - 'nightly'
- - 'weekly'
- - 'pre-release'
- - 'release'
- description: 'Testsuite to run (only for INTEGRATION_TEST=yes)'
+ - "mr"
+ - "nightly"
+ - "weekly"
+ - "pre-release"
+ - "release"
+ description: "Testsuite to run (only for INTEGRATION_TEST=yes)"
INTEGRATION_TEST_TIME_LIMIT:
- value: '900'
- description: 'Timeout in seconds per test'
+ value: "900"
+ description: "Timeout in seconds per test"
INTEGRATION_TEST_CASES:
- value: 'all'
+ value: "all"
description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST:
- value: 'yes'
+ value: "yes"
options:
- - 'yes'
- - 'no'
+ - "yes"
+ - "no"
description: To run the funtional test suite
FUNCTIONAL_TEST_SCOPE:
- value: 'mr'
+ value: "mr"
options:
- - 'mr'
- - 'nightly'
- - 'weekly'
- - 'pre-release'
- - 'release'
- description: 'Testsuite to run (only for FUNCTIONAL_TEST=yes)'
+ - "mr"
+ - "nightly"
+ - "weekly"
+ - "pre-release"
+ - "release"
+ description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)"
FUNCTIONAL_TEST_REPEAT:
- value: '5'
- description: 'Number of repetitions per test'
+ value: "5"
+ description: "Number of repetitions per test"
FUNCTIONAL_TEST_TIME_LIMIT:
- value: '2700'
- description: 'Timeout in seconds per test'
+ value: "2700"
+ description: "Timeout in seconds per test"
FUNCTIONAL_TEST_CASES:
- value: 'all'
+ value: "all"
description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST_NAME:
- description: 'Name of functional test run (only for pre-release and release)'
- value: '$$CI_COMMIT_SHA'
+ description: "Name of functional test run (only for pre-release and release)"
+ value: "$$CI_COMMIT_SHA"
FUNCTIONAL_TEST_RECORD_CHECKPOINTS:
- value: 'no'
- description: 'Record golden checkpoints'
+ value: "no"
+ description: "Record golden checkpoints"
options:
- - 'yes'
- - 'no'
+ - "yes"
+ - "no"
CLUSTER_A100:
- value: 'dgxa100_dracooci'
+ value: "dgxa100_dracooci"
options:
- - 'dgxa100_dracooci'
- - 'dgxa100_dracooci-ord'
- description: 'Cluster for A100 workloads'
+ - "dgxa100_dracooci"
+ - "dgxa100_dracooci-ord"
+ description: "Cluster for A100 workloads"
CLUSTER_H100:
- value: 'dgxh100_coreweave'
+ value: "dgxh100_coreweave"
options:
- - 'dgxh100_coreweave'
- - 'dgxh100_eos'
- description: 'Cluster for H100 workloads'
+ - "dgxh100_coreweave"
+ - "dgxh100_eos"
+ description: "Cluster for H100 workloads"
CLUSTER_GB200:
- value: 'dgxgb200_oci-hsg'
+ value: "dgxgb200_oci-hsg"
options:
- - 'dgxgb200_oci-hsg'
- description: 'Cluster for H100 workloads'
+ - "dgxgb200_oci-hsg"
+  description: "Cluster for GB200 workloads"
PUBLISH:
- value: 'no'
+ value: "no"
options:
- - 'yes'
- - 'no'
+ - "yes"
+ - "no"
description: Build and publish a wheel to PyPi
PUBLISH_COMMIT:
- value: '$$CI_COMMIT_SHA'
+ value: "$$CI_COMMIT_SHA"
description: Which commit to publish
PUBLISH_VERSION_BUMP_BRANCH:
- value: '$$CI_COMMIT_BRANCH'
+ value: "$$CI_COMMIT_BRANCH"
description: Which branch to target for version bump
PUBLISH_SCOPE:
- value: 'code-freeze'
+ value: "code-freeze"
options:
- - 'code-freeze'
- - 'release'
- - 'review-reminder'
- - 'upgrade-dependencies'
+ - "code-freeze"
+ - "release"
+ - "review-reminder"
+ - "upgrade-dependencies"
description: Type of publish (freeze or final release)
# CI wide variables
@@ -267,7 +273,7 @@ variables:
CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
UTILITY_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_utility
- TE_GIT_REF: ''
+ TE_GIT_REF: ""
include:
- .gitlab/stages/00.pre.yml
diff --git a/.gitlab/stages/01.build.yml b/.gitlab/stages/01.build.yml
index 61521295a93..f5a80c4074d 100644
--- a/.gitlab/stages/01.build.yml
+++ b/.gitlab/stages/01.build.yml
@@ -64,12 +64,12 @@ test:pre_build_image:
- IMAGE: CI_MCORE_DEV_IMAGE
FILE: Dockerfile.ci.dev
IMAGE_TYPE: dev
- BASE_IMAGE: nvcr.io/nvidia/pytorch:25.11-py3
+ BASE_IMAGE: nvcr.io/nvidia/pytorch:26.02-py3
PLATFORM: amd64
- IMAGE: CI_MCORE_DEV_IMAGE
FILE: Dockerfile.ci.dev
IMAGE_TYPE: dev
- BASE_IMAGE: nvcr.io/nvidia/pytorch:25.11-py3
+ BASE_IMAGE: nvcr.io/nvidia/pytorch:26.02-py3
PLATFORM: arm64
- IMAGE: UTILITY_IMAGE
FILE: Dockerfile.linting
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 00000000000..aa37017f082
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,14 @@
+comment: false
+coverage:
+ status:
+ project: false
+ patch:
+ default:
+ target: 80%
+ threshold: 5%
+ base: auto
+ if_ci_failed: error
+ if_no_uploads: success
+ if_not_found: success
+fixes:
+ - "/opt/megatron-lm/::"
diff --git a/docs/api-backwards-compatibility-check.md b/docs/api-backwards-compatibility-check.md
index e1b6939b06f..40f56ec0c00 100644
--- a/docs/api-backwards-compatibility-check.md
+++ b/docs/api-backwards-compatibility-check.md
@@ -1,3 +1,7 @@
+---
+orphan: true
+---
+
----
-orphan: true
----
-
# Documentation Development
- [Documentation Development](#documentation-development)
diff --git a/docs/get-started/install.md b/docs/get-started/install.md
index e1d7202b3fc..5781d065fae 100644
--- a/docs/get-started/install.md
+++ b/docs/get-started/install.md
@@ -7,90 +7,117 @@
license agreement from NVIDIA CORPORATION is strictly prohibited.
-->
-# Megatron Core Installation
-
-Installation is supported using Docker and pip.
+# Installation
## System Requirements
-### Hardware Requirements
+### Hardware
-- **FP8 Support**: NVIDIA Hopper, Ada, Blackwell GPUs
- **Recommended**: NVIDIA Turing architecture or later
+- **FP8 Support**: Requires NVIDIA Hopper, Ada, or Blackwell GPUs
-### Software Requirements
+### Software
-- **CUDA/cuDNN/NCCL**: Latest stable versions
-- **PyTorch**: Latest stable version
-- **Transformer Engine**: Latest stable version
-- **Python**: 3.12 recommended
+- **Python**: >= 3.10 (3.12 recommended)
+- **PyTorch**: >= 2.6.0
+- **CUDA Toolkit**: Latest stable version
-## Docker Installation (Recommended)
+## Prerequisites
-We strongly recommend using the previous releases of [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) rather than the latest one for optimal compatibility with Megatron Core release and testing matrix. Our releases are always based on the previous month's NGC container, so this ensures compatibility and stability.
+Install [uv](https://docs.astral.sh/uv/), a fast Python package installer:
-**Note:** The NGC PyTorch container constraints the python environment globally via `PIP_CONSTRAINT`. In the following examples we will unset the variable.
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
-This container comes with all dependencies pre-installed with compatible versions and optimized configurations for NVIDIA GPUs:
-- PyTorch (latest stable version)
-- CUDA, cuDNN, NCCL (latest stable versions)
-- Support for FP8 on NVIDIA Hopper, Ada, and Blackwell GPUs
-- For best performance, use NVIDIA Turing GPU architecture generations and later
+## Option A: Pip Install (Recommended)
+
+Install the latest stable release from PyPI:
```bash
-# Run container with mounted directories
-docker run --runtime --nvidia --gpus all -it --rm \
- -v /path/to/megatron:/workspace/megatron \
- -v /path/to/dataset:/workspace/dataset \
- -v /path/to/checkpoints:/workspace/checkpoints \
- -e PIP_CONSTRAINT= \
- nvcr.io/nvidia/pytorch:25.04-py3
+uv pip install megatron-core
```
-## Pip Installation
+To include optional training dependencies (Weights & Biases, SentencePiece, HF Transformers):
+
+```bash
+uv pip install "megatron-core[training]"
+```
-Megatron Core installation offers support for two NGC PyTorch containers:
+For all extras including [Transformer Engine](https://github.com/NVIDIA/TransformerEngine):
-- `dev`: Moving head that supports the most recent upstream dependencies
-- `lts`: Long-term support of NGC PyTorch 24.01
+```bash
+uv pip install --group build
+uv pip install --no-build-isolation "megatron-core[training,dev]"
+```
-Both containers can be combined with `mlm`, which adds package dependencies for Megatron-LM on top of Megatron Core.
+```{note}
+`--no-build-isolation` requires build dependencies to be pre-installed in the environment. `torch` is needed because several `[dev]` packages (`mamba-ssm`, `nv-grouped-gemm`, `transformer-engine`) import it at build time to compile CUDA kernels. Expect this step to take **20+ minutes** depending on your hardware. If you prefer pre-built binaries, the [NGC Container](#option-c-ngc-container) ships with these pre-compiled.
+```
+```{warning}
+Building from source can consume a large amount of memory. By default the build runs one compiler job per CPU core, which may cause out-of-memory failures on machines with many cores. To limit parallel compilation jobs, set the `MAX_JOBS` environment variable before installing (e.g. `MAX_JOBS=4`).
+```
-1. Install the latest release dependencies
+```{tip}
+For a lighter set of development dependencies without Transformer Engine and ModelOpt, use `[lts]` instead of `[dev]`: `uv pip install --no-build-isolation "megatron-core[training,lts]"`. The `[lts]` and `[dev]` extras are mutually exclusive.
+```
- ```bash
- pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
- pip install --no-build-isolation megatron-core[dev]
- ```
+To clone the repository for examples:
-2. Next choose one of the following options:
+```bash
+git clone https://github.com/NVIDIA/Megatron-LM.git
+```
-* For running an Megatron LM application
- ```bash
- pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
- pip install --no-build-isolation megatron-core[mlm,dev]
- ```
-* Install packages for LTS support NGC PyTorch 24.01
+## Option B: Install from Source
- ```bash
- pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
- pip install --no-build-isolation megatron-core[lts]
- ```
+For development or to run the latest unreleased code:
-* For running an Megatron LM application
+```bash
+git clone https://github.com/NVIDIA/Megatron-LM.git
+cd Megatron-LM
+uv pip install -e .
+```
- ```bash
- pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
- pip install --no-build-isolation megatron-core[mlm,lts]
- ```
+To install with all development dependencies (includes Transformer Engine, requires pre-installed build deps):
-* For a version of Megatron Core with only Torch, run
+```bash
+uv pip install --group build
+uv pip install --no-build-isolation -e ".[training,dev]"
+```
+
+```{tip}
+If the build runs out of memory, limit parallel compilation jobs with `MAX_JOBS=4 uv pip install --no-build-isolation -e ".[training,dev]"`.
+```
+
+
+## Option C: NGC Container
+
+For a pre-configured environment with all dependencies pre-installed (PyTorch, CUDA, cuDNN, NCCL, Transformer Engine), use the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch).
+
+We recommend using the **previous month's** NGC container rather than the latest one to ensure compatibility with the current Megatron Core release and testing matrix.
+
+```bash
+docker run --gpus all -it --rm \
+ -v /path/to/dataset:/workspace/dataset \
+ -v /path/to/checkpoints:/workspace/checkpoints \
+ -e PIP_CONSTRAINT= \
+ nvcr.io/nvidia/pytorch:26.01-py3
+```
+
+```{note}
+The NGC PyTorch container constrains the Python environment globally via `PIP_CONSTRAINT`. The `-e PIP_CONSTRAINT=` flag above unsets this so that Megatron Core and its dependencies install correctly.
+```
+
+Then install Megatron Core inside the container (torch is already available in the NGC image):
+
+```bash
+pip install uv
+uv pip install --no-build-isolation "megatron-core[training,dev]"
+```
- ```bash
- pip install megatron-core
- ```
+You are now ready to run training. See [Your First Training Run](quickstart.md) for next steps.
diff --git a/docs/get-started/overview.md b/docs/get-started/overview.md
index 42b964d5cec..b7f84ee22e5 100644
--- a/docs/get-started/overview.md
+++ b/docs/get-started/overview.md
@@ -87,7 +87,7 @@ After training or modifying a Megatron model, you can convert it again for deplo
- **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
- **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
- **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
-- **[Model Optimizer (ModelOpt)](https://github.com/NVIDIA/Model-Optimizer)** - Model optimization toolkit for quantization, pruning, distillation, speculative decoding, and more. Checkout end-to-end examples in [examples/post_training/modelopt](./examples/post_training/modelopt/).
+- **[Model Optimizer (ModelOpt)](https://github.com/NVIDIA/Model-Optimizer)** - Model optimization toolkit for quantization, pruning, distillation, speculative decoding, and more. Check out end-to-end examples in [examples/post_training/modelopt](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt).
**Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
diff --git a/docs/get-started/quickstart.md b/docs/get-started/quickstart.md
index 2addcb519a2..c8797aeedd4 100644
--- a/docs/get-started/quickstart.md
+++ b/docs/get-started/quickstart.md
@@ -7,54 +7,40 @@
license agreement from NVIDIA CORPORATION is strictly prohibited.
-->
-# Quick Start
+# Your First Training Run
-## Quick Installation
+This guide walks you through running your first training jobs with Megatron Core. Make sure you have completed [installation](install.md) before proceeding.
-Install Megatron Core with pip:
+## Simple Training Example
-1. Install Megatron Core with required dependencies:
-
- ```bash
- pip install --no-build-isolation megatron-core[mlm,dev]
- ```
-
-2. Clone repository for examples:
-
- ```bash
- git clone https://github.com/NVIDIA/Megatron-LM.git
- cd Megatron-LM
- pip install --no-build-isolation .[mlm,dev]
- ```
-
-That's it! You're ready to start training.
-
-## Your First Training Run
-
-### Simple Training Example
+Run a minimal distributed training loop with mock data on 2 GPUs:
```bash
-# Distributed training example (2 GPUs, mock data)
torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py
```
-### LLaMA-3 Training Example
+## LLaMA-3 Training Example
+
+Train a LLaMA-3 8B model with FP8 precision on 8 GPUs using mock data:
```bash
-# 8 GPUs, FP8 precision, mock data
-./examples/llama/train_llama3_8b_fp8.sh
+./examples/llama/train_llama3_8b_h100_fp8.sh
```
## Data Preparation
-### JSONL Data Format
+To train on your own data, Megatron expects preprocessed binary files (`.bin` and `.idx`).
+
+### 1. Prepare a JSONL File
+
+Each line should contain a `text` field:
```json
{"text": "Your training text here..."}
{"text": "Another training sample..."}
```
-### Basic Preprocessing
+### 2. Preprocess the Data
```bash
python tools/preprocess_data.py \
diff --git a/docs/index.md b/docs/index.md
index c68f608a73b..4b75ed2c0c8 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -38,8 +38,8 @@ get-started/releasenotes
:hidden:
:caption: Get Started
-get-started/quickstart
get-started/install
+get-started/quickstart
```
```{toctree}
@@ -71,6 +71,7 @@ user-guide/features/custom_fsdp
user-guide/features/dist_optimizer
user-guide/features/optimizer_cpu_offload
user-guide/features/pipeline_parallel_layout
+user-guide/features/fine_grained_activation_offloading
user-guide/features/megatron_energon
user-guide/features/megatron_rl
user-guide/features/tokenizers
diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md
index 076409cd4f5..95568adce78 100644
--- a/docs/llama_mistral.md
+++ b/docs/llama_mistral.md
@@ -11,7 +11,7 @@
NOTE: In order to simplify code we now only support converting llama-3.x and mistral checkpoints downloaded from Hugging Face. For converting other models, see [Megatron Bridge](models/index.md).
-The [Llama-2](https://ai.meta.com/llama/) and [Llama-3.x](https://llama.meta.com/) family of models are an open-source set of pretrained & finetuned (for chat) models that have achieved strong results across a wide set of benchmarks. At their times of release, both Llama-2 and Llama-3 models achieved among the best results for open-source models, and were competitive with leading closed-source models (see https://arxiv.org/pdf/2307.09288.pdf and https://ai.meta.com/blog/meta-llama-3/).
+The Llama-2 and Llama-3.x family of models are an open-source set of pretrained & finetuned (for chat) models that have achieved strong results across a wide set of benchmarks. At their times of release, both Llama-2 and Llama-3 models achieved among the best results for open-source models, and were competitive with leading closed-source models (see the [Llama-2 paper](https://arxiv.org/pdf/2307.09288.pdf) and the [Llama-3 announcement](https://ai.meta.com/blog/meta-llama-3/)).
Similarly, [Mistral-7b](https://mistral.ai/news/announcing-mistral-7b/) is an open-source model with pretrained and finetuned (for chat) variants that achieve strong benchmark results.
@@ -50,7 +50,6 @@ Architecturally Llama-2, Llama-3 and Mistral-7b are very similar. As such Megatr
- [Known numerical differences](#known-numerical-differences)
- [Using legacy model format](#using-legacy-model-format)
-
# Llama-2
Llama-2 checkpoints can be loaded into Megatron for inference and for finetuning. Loading these checkpoints consists of three steps:
@@ -63,7 +62,7 @@ The following sections detail these steps. The final section lists benchmark res
## Download Meta or Huggingface checkpoints
-Users must first apply for access to download the Llama-2 checkpoints either directly from [Meta](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or through [Huggingface](https://huggingface.co/docs/transformers/main/model_doc/llama2) (HF). The checkpoints are available in two formats, Meta's native format (available from both the Meta and HF links), and HF's format (available only from HF). Either format can be converted to Megatron, as detailed next.
+Users must first apply for access to download the Llama-2 checkpoints through [Huggingface](https://huggingface.co/docs/transformers/main/model_doc/llama2) (HF). The checkpoints are available in two formats: Meta's native format and HF's format, both available from HF. Either format can be converted to Megatron, as detailed next.
## Convert checkpoint format
@@ -149,11 +148,11 @@ If loading for either inference or finetuning, use the following arguments:
### Launch Meta
-Meta checkpoints can be launched with: https://github.com/facebookresearch/llama
+Meta checkpoints can be launched with Meta's reference implementation in the `facebookresearch/llama` repository.
### Launch Huggingface
-Huggingface checkpoints can be launched with: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
+Huggingface checkpoints can be launched with the Llama implementation in the Huggingface `transformers` library (`modeling_llama.py`).
## Benchmark results
@@ -361,7 +360,7 @@ The following sections detail these steps.
## Download Huggingface checkpoints
-Users must first apply for access to download the Mistral-7b checkpoints through [Huggingface](https://huggingface.co/mistralai/Mistral-7B-v0.3) (HF).
+Users must first apply for access to download the Mistral-7b checkpoints through Huggingface. Two variants are available: the base model ([Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3)) and the instruct model ([Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)).
## Convert checkpoint format
@@ -437,7 +436,7 @@ Many models such as Yi-34B and Qwen2.x use the Llama architecture and may be con
It is not expected that the megatron and Huggingface implementations of llama3.x and mistral models will produce numerically identical results. There are multiple points where small numerical differences are expected. This is a non-exhaustive list:
-1. TransformerEngine (TE) uses the model params_dtype inside RMSNorm whereas the Huggingface implementation uses fp32. See for details: https://github.com/NVIDIA/TransformerEngine/issues/1132
+1. TransformerEngine (TE) uses the model params_dtype inside RMSNorm whereas the Huggingface implementation uses fp32 (see TransformerEngine issue #1132 for details).
2. Huggingface `transformers` implements the q, k and v projections in self-attention as separate GEMMs whereas Megatron core combines them into a single GEMM for efficiency. This leads to small numerical differences.
# Using legacy model format
diff --git a/docs/user-guide/data-preparation.md b/docs/user-guide/data-preparation.md
index 18da2d80fe1..ea91bee4309 100644
--- a/docs/user-guide/data-preparation.md
+++ b/docs/user-guide/data-preparation.md
@@ -46,6 +46,46 @@ python tools/preprocess_data.py \
| `--workers` | Number of parallel workers for processing |
| `--append-eod` | Add end-of-document token |
+## Finding Optimal Number of Workers
+
+Use the `--find-optimal-num-workers` flag to find the number of workers that gives the best performance in terms of preprocessed documents per second.
+The script will launch a few short data preprocessing runs, each with a different number of workers, and determine the fastest configuration from the collected performance data.
+
+```bash
+python tools/preprocess_data.py \
+ --input data.jsonl \
+ --output-prefix processed_data \
+ --tokenizer-type HuggingFaceTokenizer \
+ --tokenizer-model /path/to/tokenizer.model \
+ --workers 8 \
+ --find-optimal-num-workers \
+ --workers-to-check 4 8 16 32 \
+ --max-documents 50000
+```
+
+**Required arguments**
+
+| Argument | Description |
+|----------|-------------|
+| `--find-optimal-num-workers` | Enables the search for the optimal number of workers |
+| `--workers-to-check` | List of worker counts to try |
+| `--max-documents` | Number of documents to be preprocessed during each run |
+
+**Output example**
+
+```bash
+-----------------------------------
+Performance results (fastest → slowest):
+1. 16 workers → avg. docs/s: 9606.6476
+2. 32 workers → avg. docs/s: 9275.3284
+3. 8 workers → avg. docs/s: 9151.9280
+4. 4 workers → avg. docs/s: 6391.3819
+
+-----------------------------------
+The most optimal num of workers is 16 with avg. preprocessed docs/s: 9606.6476.
+-----------------------------------
+```
+
## Output Files
The preprocessing tool generates two files:
diff --git a/docs/user-guide/features/index.md b/docs/user-guide/features/index.md
index fc5a1aa1abe..59cef95d574 100644
--- a/docs/user-guide/features/index.md
+++ b/docs/user-guide/features/index.md
@@ -14,6 +14,7 @@ Advanced feature guides for key Megatron Core capabilities.
```{toctree}
:maxdepth: 2
+fine_grained_activation_offloading
moe
context_parallel
custom_fsdp
diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md
index 45e70c3a520..d12f3e35af2 100644
--- a/docs/user-guide/index.md
+++ b/docs/user-guide/index.md
@@ -1,3 +1,7 @@
+---
+orphan: true
+---
+