diff --git a/.github/actions/action.yml b/.github/actions/action.yml
index decaa5ff3f8..7e2faf936b4 100644
--- a/.github/actions/action.yml
+++ b/.github/actions/action.yml
@@ -59,7 +59,7 @@ runs:
run: echo "node_name=$NODE_NAME" | tee -a "$GITHUB_OUTPUT"
- name: Checkout repository
- uses: actions/checkout@v2
+ uses: actions/checkout@v6
- name: Change ownership of /home/runner/
shell: bash
@@ -98,7 +98,8 @@ runs:
--environment dev \
--platform dgx_h100 \
--tag ${{ inputs.tag }} \
- --container-image ${{ inputs.container-image }}
+ --container-image ${{ inputs.container-image }} \
+ --hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME
RUN_TEST_EOF
)
@@ -186,6 +187,7 @@ runs:
--platform dgx_h100 \
--container-image ${{ inputs.container-image }} \
--data-dir /mnt/datadrive/TestData/megatron-lm/artifacts \
+ --hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME
RUN_TEST_EOF
)
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
index fed1792d2f9..8c1ef48cf00 100644
--- a/.github/copy-pr-bot.yaml
+++ b/.github/copy-pr-bot.yaml
@@ -1,4 +1,4 @@
enabled: true
auto_sync_draft: false
auto_sync_ready: true
-trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "parthmannan", "prajwal1210", "pthombre", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]
+trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "ksivaman", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "nanz-nv", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "wplf", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]
diff --git a/.github/oncall_schedule.json b/.github/oncall_schedule.json
index 58fcf0ddbbc..fe90397c2ad 100644
--- a/.github/oncall_schedule.json
+++ b/.github/oncall_schedule.json
@@ -1,16 +1,4 @@
[
- {
- "user": "janEbert",
- "date": "2026-02-18"
- },
- {
- "user": "asolergi-nv",
- "date": "2026-02-25"
- },
- {
- "user": "BoxiangW",
- "date": "2026-03-04"
- },
{
"user": "maanug-nv",
"date": "2026-03-11"
@@ -20,31 +8,43 @@
"date": "2026-03-18"
},
{
- "user": "gautham-kollu",
+ "user": "janEbert",
"date": "2026-03-25"
},
{
- "user": "janEbert",
+ "user": "gautham-kollu",
"date": "2026-04-01"
},
{
- "user": "maanug-nv",
+ "user": "ilml",
"date": "2026-04-08"
},
{
- "user": "BoxiangW",
+ "user": "Phlip79",
"date": "2026-04-15"
},
{
- "user": "Phlip79",
+ "user": "asolergi-nv",
"date": "2026-04-22"
},
{
- "user": "asolergi-nv",
+ "user": "BoxiangW",
"date": "2026-04-29"
},
{
- "user": "dimapihtar",
+ "user": "maanug-nv",
"date": "2026-05-06"
+ },
+ {
+ "user": "dimapihtar",
+ "date": "2026-05-13"
+ },
+ {
+ "user": "gautham-kollu",
+ "date": "2026-05-20"
+ },
+ {
+ "user": "ilml",
+ "date": "2026-05-27"
}
]
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 5cd5138eb69..d2825f9c34b 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -5,19 +5,8 @@
## Contribution process
-```mermaid
-flowchart LR
- A[Pre-checks] --> B[PR Tests]
- subgraph Code Review/Approval
- C1[Expert Review] --> C2[Final Review]
- end
- B --> C1
- C2 --> D[Merge]
-```
-
### Pre-checks
-- [ ] I want this PR in a versioned release and have added the appropriate Milestone (e.g., `Core 0.8`)
- [ ] I have added relevant unit tests
- [ ] I have added relevant functional tests
- [ ] I have added proper typing to my code [Typing guidelines](https://docs.python.org/3/library/typing.html)
@@ -26,33 +15,32 @@ flowchart LR
### Code review
-The following process is enforced via the CODEOWNERS file for changes into `megatron/core`. For changes outside of `megatron/core`, it is up to the PR author whether or not to tag the Final Reviewer team.
+Feel free to message or comment the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall) to help accelerate your merge into main. The less complex your PR is, the faster it will be approved and merged!
-
-For MRs into `main` branch
+All PRs start as **draft**. If you open a non-draft PR, it will be automatically converted to draft.
-Feel free to message or comment the @mcore-oncall to help accelerate your merge into main. The less complex your PR is, the faster it will be approved and merged!
+#### Step 1: Mark PR as "Ready for Review"
-#### (Step 1): Add PR label `Expert Review`
+1. When your PR is ready, click **Ready for Review**.
+2. An oncall reviewer is auto-assigned and expert reviewers are notified based on your changes.
+ - Some PRs may jump straight to step 2. This is determined by `.github/CODEOWNERS`.
-#### (Step 2): Collect the expert reviewers reviews
+:warning: Only mark as ready once merge-conflicts are resolved and the CI is passing.
+Final Review might get declined if these requirements are not fulfilled.
-1. Attach the `Expert Review` label when your PR is ready for review.
-2. GitHub auto-assigns expert reviewers based on your changes. They will get notified and pick up your PR soon.
+#### Step 2: Final Review
-:warning: Only proceed to the next step once all reviewers have approved, merge-conflict are resolved and the CI is passing.
-Final Review might get declined if these requirements are not fulfilled.
+For PRs that change `megatron/core`, once all expert reviewers have approved, the `Final Review` label is applied **automatically** and final reviewers are assigned.
-#### (Step 3): Final Review
+For PRs outside `megatron/core`, this step is skipped.
-1. Add `Final Review` label
-2. GitHub auto-assigns final reviewers based on your changes. They will get notified and pick up your PR soon.
+#### Step 3: Approved
-#### (Optional Step 4): Cherry-pick into release branch
+Once all required reviewers have approved, the `Approved` label is applied **automatically**.
-If this PR also needs to be merged into `core_r*` release branches, after this PR has been merged, select `Cherry-pick` to open a new PR into the release branch.
+### Merge
-
+Any member of [mcore-engineers](https://github.com/orgs/NVIDIA/teams/mcore-engineers) will be able to merge your PR.
For MRs into `dev` branch
@@ -60,7 +48,3 @@ The proposed review process for `dev` branch is under active discussion.
MRs are mergable after one approval by either `eharper@nvidia.com` or `zijiey@nvidia.com`.
-
-### Merging your PR
-
-Any member of [core-adlr](https://github.com/orgs/teams/NVIDIA/core-adlr) and [`core-nemo`](https://github.com/orgs/teams/NVIDIA/core-nemo) will be able to merge your PR.
diff --git a/.github/workflows/_build_test_publish_wheel.yml b/.github/workflows/_build_test_publish_wheel.yml
index 0b71577b587..f29488c4494 100644
--- a/.github/workflows/_build_test_publish_wheel.yml
+++ b/.github/workflows/_build_test_publish_wheel.yml
@@ -43,7 +43,7 @@ jobs:
PUBLISH_DRYRUN: ${{ inputs.dry-run }}
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
ref: ${{ inputs.ref }}
@@ -136,7 +136,7 @@ jobs:
test "${{ steps.build-wheel.outputs.expected-release-number }}" == "$RELEASE_NUMBER"
- name: Upload wheels
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
path: dist/
@@ -159,7 +159,7 @@ jobs:
PACKAGE: ${{ matrix.PACKAGE }}
steps:
- name: Download wheels
- uses: actions/download-artifact@v4
+ uses: actions/download-artifact@v7
with:
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
path: dist/
diff --git a/.github/workflows/_release_library.yml b/.github/workflows/_release_library.yml
index 684dacc27aa..46de1bb3a66 100644
--- a/.github/workflows/_release_library.yml
+++ b/.github/workflows/_release_library.yml
@@ -53,6 +53,11 @@ on:
description: Starting tag for changelog builder (leave empty for auto-detect)
type: string
default: ""
+ publish-docs:
+ required: false
+ description: Publish documentation to S3 after release
+ type: boolean
+ default: true
secrets:
TWINE_PASSWORD:
required: true
@@ -60,6 +65,22 @@ on:
required: true
PAT:
required: true
+ AWS_ASSUME_ROLE_ARN:
+ required: true
+ AWS_ACCESS_KEY_ID:
+ required: true
+ AWS_SECRET_ACCESS_KEY:
+ required: true
+ AKAMAI_HOST:
+ required: true
+ AKAMAI_CLIENT_TOKEN:
+ required: true
+ AKAMAI_CLIENT_SECRET:
+ required: true
+ AKAMAI_ACCESS_TOKEN:
+ required: true
+ S3_BUCKET_NAME:
+ required: true
permissions:
contents: write # To read repository content
@@ -89,7 +110,7 @@ jobs:
IS_DRY_RUN: ${{ inputs.dry-run }}
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
token: ${{ secrets.PAT }}
@@ -199,11 +220,8 @@ jobs:
# Extract PR number from URL
PR_NUMBER=$(echo $PR_URL | grep -o '[0-9]*$')
- # Add comment to the newly created PR
- echo gh pr comment $PR_NUMBER --body "/ok to test $(git rev-parse HEAD)"
-
- name: Wait for status checks on tmp branch
- uses: actions/github-script@v7
+ uses: actions/github-script@v8
id: wait-status
with:
github-token: ${{ secrets.PAT }}
@@ -326,7 +344,6 @@ jobs:
ref: ${{ inputs.release-ref }}
no-publish: false
secrets:
- TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
create-gh-release:
@@ -344,10 +361,10 @@ jobs:
REPOSITORY: ${{ github.repository }}
PROJECT_NAME: Megatron Core
VERSION: ${{ needs.bump-next-version.outputs.release-version }}
- TAG_PREFIX: ${{ inputs.gh-release-tag-prefix || '' }}
+ TAG_PREFIX: core_
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
ref: ${{ inputs.release-ref }}
@@ -455,6 +472,12 @@ jobs:
publish-docs:
needs: [bump-next-version, create-gh-release]
uses: ./.github/workflows/release-docs.yml
+ if: |
+ (
+ success() || !failure()
+ )
+ && inputs.publish-docs == true
+ && !cancelled()
with:
dry-run: ${{ inputs.dry-run }}
publish-as-latest: true
@@ -472,7 +495,7 @@ jobs:
VERSION: ${{ needs.build-test-publish-wheels.outputs.version }}
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
repository: NVIDIA-NeMo/FW-CI-templates
ref: v0.17.0
diff --git a/.github/workflows/_update_dependencies.yml b/.github/workflows/_update_dependencies.yml
index a60e69f701b..903d773edbd 100644
--- a/.github/workflows/_update_dependencies.yml
+++ b/.github/workflows/_update_dependencies.yml
@@ -33,7 +33,7 @@ jobs:
TARGET_BRANCH: ${{ inputs.target-branch }}
steps:
- name: Checkout repo
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
ref: ${{ env.TARGET_BRANCH }}
@@ -60,7 +60,7 @@ jobs:
fi
- name: Checkout repo
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
ref: ${{ env.SOURCE_BRANCH }}
@@ -77,7 +77,7 @@ jobs:
bash -c 'uv lock --upgrade'
- name: Upload lock file
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
name: lock-file-${{ env.SOURCE_BRANCH }}
path: uv.lock
@@ -90,7 +90,7 @@ jobs:
TARGET_BRANCH: ${{ inputs.target-branch }}
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
token: ${{ secrets.PAT }}
ref: ${{ env.TARGET_BRANCH }}
@@ -103,7 +103,7 @@ jobs:
fi
- name: Download lock file
- uses: actions/download-artifact@v4
+ uses: actions/download-artifact@v7
with:
name: lock-file-${{ env.SOURCE_BRANCH }}
diff --git a/.github/workflows/auto-reminder-bot.yml b/.github/workflows/auto-reminder-bot.yml
index 37e6e5498e3..72a48e9539e 100644
--- a/.github/workflows/auto-reminder-bot.yml
+++ b/.github/workflows/auto-reminder-bot.yml
@@ -14,10 +14,10 @@ jobs:
if: github.repository == 'NVIDIA/Megatron-LM'
steps:
- name: Check out repository code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: "3.10"
diff --git a/.github/workflows/auto-swap-labels.yml b/.github/workflows/auto-swap-labels.yml
index 5335026e2af..f1dd9757c8a 100644
--- a/.github/workflows/auto-swap-labels.yml
+++ b/.github/workflows/auto-swap-labels.yml
@@ -2,32 +2,74 @@
name: Auto Swap Labels
on:
- pull_request_review:
- types: [submitted]
+ pull_request_target:
+ types: [ready_for_review, synchronize]
+ branches:
+ - main
+ workflow_run:
+ workflows: ["Review Trigger"]
+ types: [completed]
permissions:
pull-requests: write
contents: read
+ actions: read
jobs:
check-approval:
runs-on: ubuntu-latest
- if: github.event.review.state == 'approved' && github.repository == 'NVIDIA/Megatron-LM'
+ if: >-
+ github.repository == 'NVIDIA/Megatron-LM' && (
+ (github.event_name == 'pull_request_target' &&
+ github.event.pull_request.base.ref == 'main' &&
+ !github.event.pull_request.draft) ||
+ (github.event_name == 'workflow_run' &&
+ github.event.workflow_run.conclusion == 'success')
+ )
+
steps:
+ - name: Get PR number from workflow_run
+ id: get-pr
+ if: github.event_name == 'workflow_run'
+ continue-on-error: true
+ uses: actions/download-artifact@v7
+ with:
+ name: pr-number
+ path: pr-number
+ github-token: ${{ github.token }}
+ run-id: ${{ github.event.workflow_run.id }}
+
+ - name: Set PR number
+ id: pr
+ run: |
+ if [ "${{ github.event_name }}" = "workflow_run" ]; then
+ if [ "${{ steps.get-pr.outcome }}" != "success" ]; then
+ echo "No approval artifact found — review was not an approval. Skipping."
+ exit 0
+ fi
+ echo "number=$(cat pr-number/number)" >> $GITHUB_OUTPUT
+ else
+ echo "number=${{ github.event.pull_request.number }}" >> $GITHUB_OUTPUT
+ fi
+
- name: Check out repository code
+ if: steps.pr.outputs.number
uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v5
+ if: steps.pr.outputs.number
+ uses: actions/setup-python@v6
with:
python-version: "3.10"
- name: Install dependencies
+ if: steps.pr.outputs.number
run: |
pip install --no-cache-dir PyGithub slack-sdk
- - name: Run Auto Reminder Bot
+ - name: Run Auto Swap Labels
+ if: steps.pr.outputs.number
run: |
- export GH_TOKEN=${{ github.token }}
- export PR_NUMBER=${{ github.event.pull_request.number }}
+ export GH_TOKEN=${{ secrets.PAT }}
+ export PR_NUMBER=${{ steps.pr.outputs.number }}
python tests/test_utils/python_scripts/swap_pr_labels.py
diff --git a/.github/workflows/auto-update-copy-pr-bot.yml b/.github/workflows/auto-update-copy-pr-bot.yml
index 3358a747f34..07fdcfbfbb8 100644
--- a/.github/workflows/auto-update-copy-pr-bot.yml
+++ b/.github/workflows/auto-update-copy-pr-bot.yml
@@ -11,7 +11,7 @@ jobs:
if: github.repository == 'NVIDIA/Megatron-LM'
steps:
- name: Checkout code
- uses: actions/checkout@v3
+ uses: actions/checkout@v6
with:
token: ${{ secrets.PAT }}
ref: main
diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml
new file mode 100644
index 00000000000..1bc7dfcb6e4
--- /dev/null
+++ b/.github/workflows/build-docs.yml
@@ -0,0 +1,65 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Build docs
+
+on:
+ push:
+ branches:
+ - main
+ - "pull-request/[0-9]+"
+ - "deploy-release/*"
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
+ cancel-in-progress: true
+
+jobs:
+ pre-flight:
+ uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
+
+ build-docs:
+ needs: [pre-flight]
+ if: needs.pre-flight.outputs.is_deployment_workflow != 'true'
+ uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.57.0
+
+ build-docs-summary:
+ needs: [pre-flight, build-docs]
+ if: |
+ (
+ needs.pre-flight.outputs.is_deployment_workflow == 'true'
+ || always()
+ )
+ && !cancelled()
+ runs-on: ubuntu-latest
+ steps:
+ - name: Get workflow result
+ id: result
+ shell: bash -x -e -u -o pipefail {0}
+ env:
+ GH_TOKEN: ${{ github.token }}
+ RUN_ID: ${{ github.run_id }}
+ SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
+ run: |
+ FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || FAILED_JOBS=0
+
+ if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
+ echo "✅ All previous jobs completed successfully"
+ exit 0
+ else
+ echo "❌ Found $FAILED_JOBS failed job(s)"
+ # Show which jobs failed
+ gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
+ exit 1
+ fi
diff --git a/.github/workflows/build-test-publish-wheel.yml b/.github/workflows/build-test-publish-wheel.yml
index 00711b50806..88b9e8b8c61 100644
--- a/.github/workflows/build-test-publish-wheel.yml
+++ b/.github/workflows/build-test-publish-wheel.yml
@@ -58,7 +58,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Result
env:
diff --git a/.github/workflows/cicd-approve-test-queue.yml b/.github/workflows/cicd-approve-test-queue.yml
index 2cba41eafb8..02bfbbfd3a5 100644
--- a/.github/workflows/cicd-approve-test-queue.yml
+++ b/.github/workflows/cicd-approve-test-queue.yml
@@ -27,12 +27,13 @@ jobs:
strategy:
matrix:
branch: [main, dev, others]
+ contributor_type: [internal, external]
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: "3.12"
@@ -41,23 +42,44 @@ jobs:
python -m pip install --upgrade pip
pip install requests
+ - name: Download SSO users list
+ run: |
+ gh release download v0.1.0 \
+ --repo NVIDIA-GitHub-Management/github-audits \
+ --pattern users_sso.json \
+ --output users_sso.json || echo '{}' > users_sso.json
+ env:
+ GH_TOKEN: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
+
- name: Approve waiting deployments
env:
GITHUB_TOKEN: ${{ secrets.PAT }}
MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
+ MAX_CONCURRENCY_EXTERNAL: ${{ vars.MAX_CONCURRENCY_EXTERNAL || 3 }}
+ CONTRIBUTOR_TYPE: ${{ matrix.contributor_type }}
+ SSO_USERS_FILE: users_sso.json
PYTHONUNBUFFERED: 1
shell: python
run: |
import os
+ import json
import requests
import re
# GitHub API configuration
GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
REPO = os.environ["GITHUB_REPOSITORY"]
- MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"]) // 2
+ CONTRIBUTOR_TYPE = os.environ["CONTRIBUTOR_TYPE"]
+ if CONTRIBUTOR_TYPE == "external":
+ MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY_EXTERNAL"]) // 2
+ else:
+ MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"]) // 2
API_BASE = f"https://api.github.com/repos/NVIDIA/Megatron-LM"
+ # Load SSO users for internal/external classification
+ with open(os.environ["SSO_USERS_FILE"]) as f:
+ sso_users = json.load(f)
+
# Headers for GitHub API
headers = {
"Authorization": f"token {GITHUB_TOKEN}",
@@ -81,53 +103,76 @@ jobs:
print(f"Response: {e.response.text}")
return None
- def is_pr_targeting_branch(workflow_run, target_branch):
+ def is_internal_contributor(pr_info):
+ """Return True if the PR author is a member of NVIDIA or NVIDIA-NeMo org (is_org_member)."""
+ login = pr_info.get("user", {}).get("login", "")
+ org_roles = sso_users.get(login, {}).get("org_roles", [])
+ return any(role in ("NVIDIA:Member", "NVIDIA-NeMo:Member") for role in org_roles)
+
+ def get_pr_base_branch(workflow_run):
"""
- Check if a workflow run belongs to a PR targeting the given branch.
- Extract PR number from head branch like 'pull-request/1913' and verify base branch.
+ Return the base branch of the PR associated with a workflow run, or None.
+ Extracts PR number from head branch like 'pull-request/1913' and fetches PR info.
+ Returns (base_branch, pr_info) tuple, or (None, None) if not a PR run.
"""
print(workflow_run.get("head_branch", ""))
head_branch = workflow_run.get("head_branch", "")
match = re.match(r"pull-request/(\d+)", head_branch)
if not match:
- return False # Not a PR branch pattern
+ return None, None # Not a PR branch pattern
pr_number = int(match.group(1))
-
+
# Fetch PR info from GitHub API
pr_info = make_request(f"pulls/{pr_number}")
if not pr_info:
print(f"Failed to fetch PR #{pr_number}")
- return False
+ return None, None
base_branch = pr_info.get("base", {}).get("ref")
- if (
- (base_branch == target_branch) or
- (base_branch != "main" and base_branch != "dev" and target_branch == "others")
- ):
- print(f"PR #{pr_number} targets {target_branch}")
- return True
+ return base_branch, pr_info
+
+ def matches_queue(workflow_run, target_branch, contributor_type):
+ """
+ Return True if the workflow run belongs to this queue cell:
+ matching target branch AND matching contributor type (internal/external).
+ """
+ base_branch, pr_info = get_pr_base_branch(workflow_run)
+ if base_branch is None:
+ return False
+
+ branch_match = (
+ (base_branch == target_branch) or
+ (base_branch != "main" and base_branch != "dev" and target_branch == "others")
+ )
+ if not branch_match:
+ return False
- return False
+ pr_number = re.match(r"pull-request/(\d+)", workflow_run.get("head_branch", "")).group(1)
+ internal = is_internal_contributor(pr_info)
+ contributor_match = (contributor_type == "internal") == internal
+ if branch_match and contributor_match:
+ print(f"PR #{pr_number} targets {target_branch}, contributor_type={contributor_type} (internal={internal})")
+ return branch_match and contributor_match
# Get current running and queued workflows
print("Fetching workflow runs...")
queued_workflow_runs = make_request("actions/runs?status=queued").get("workflow_runs", [])
in_progress_workflow_runs = make_request("actions/runs?status=in_progress").get("workflow_runs", [])
- # Filter for workflows belonging to PRs targeting ${{ matrix.branch }}
- queued_workflow_runs = [run for run in queued_workflow_runs
- if run["name"] == "CICD Megatron-LM" and is_pr_targeting_branch(run, "${{ matrix.branch }}")]
- in_progress_workflow_runs = [run for run in in_progress_workflow_runs
- if run["name"] == "CICD Megatron-LM" and is_pr_targeting_branch(run, "${{ matrix.branch }}")]
+ # Filter for workflows belonging to PRs targeting ${{ matrix.branch }} with matching contributor type
+ queued_workflow_runs = [run for run in queued_workflow_runs
+ if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)]
+ in_progress_workflow_runs = [run for run in in_progress_workflow_runs
+ if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)]
# Count running and queued workflows
queued_workflows = len(queued_workflow_runs)
in_progress_workflows = len(in_progress_workflow_runs)
total_workflows = queued_workflows + in_progress_workflows
- print(f"Current queued workflows (PRs targeting ${{ matrix.branch }}): {queued_workflows}")
- print(f"Current running workflows (PRs targeting ${{ matrix.branch }}): {in_progress_workflows}")
+ print(f"Current queued workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {queued_workflows}")
+ print(f"Current running workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {in_progress_workflows}")
print(f"Total workflows: {total_workflows}")
print(f"Max concurrency: {MAX_CONCURRENCY}")
@@ -139,8 +184,8 @@ jobs:
print("Fetching deployments...")
pending_workflows = make_request("actions/runs?status=waiting").get("workflow_runs", [])
print("Pending workflows:", len(pending_workflows))
- pending_workflows = [run for run in pending_workflows
- if run["name"] == "CICD Megatron-LM" and is_pr_targeting_branch(run, "${{ matrix.branch }}")]
+ pending_workflows = [run for run in pending_workflows
+ if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)]
# Sort deployments by creation date (oldest first)
print("Sorting workflows...")
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 2cc025baf99..f63011df63f 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -25,7 +25,7 @@ on:
workflow_dispatch:
concurrency:
- group: ${{ github.workflow }}-${{ github.head_ref || github.ref || github.event.pull_request.number }}
+ group: ${{ github.workflow }}-${{ github.head_ref || github.event.merge_group.head_ref || github.ref }}
cancel-in-progress: true
permissions:
@@ -52,7 +52,7 @@ jobs:
DISABLE_EXTERNAL_CONTRIBUTOR: ${{ vars.DISABLE_EXTERNAL_CONTRIBUTOR }}
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
token: ${{ env.GITHUB_TOKEN }}
@@ -145,7 +145,7 @@ jobs:
)
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -208,7 +208,7 @@ jobs:
mbridge-test-suite: ${{ steps.select-mbridge-test-suite.outputs.main }}
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Get PR info
id: get-pr-info
@@ -251,7 +251,7 @@ jobs:
uses: nv-gha-runners/get-pr-info@main
- name: Checkout MBridge and create testing branch
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
ref: main
repository: NVIDIA-NeMo/Megatron-Bridge
@@ -347,12 +347,12 @@ jobs:
echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT"
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
ref: ${{ steps.sha.outputs.main }}
- name: Setup python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: 3.12
@@ -470,7 +470,7 @@ jobs:
&& !cancelled()
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Parse unit tests
id: parse-unit-tests
run: |
@@ -510,7 +510,7 @@ jobs:
PIP_ROOT_USER_ACTION: ignore
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: main
uses: ./.github/actions
with:
@@ -545,7 +545,7 @@ jobs:
integration-tests: ${{ steps.main.outputs.integration-tests }}
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Get PR info
id: get-pr-info
@@ -649,7 +649,7 @@ jobs:
&& !cancelled()
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: main
uses: ./.github/actions
with:
@@ -682,7 +682,7 @@ jobs:
permissions: write-all
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Get workflow result
id: result
@@ -719,7 +719,7 @@ jobs:
&& github.repository == 'NVIDIA/Megatron-LM'
steps:
- name: Generate fake coverage report
- uses: actions/github-script@v6
+ uses: actions/github-script@v8
with:
github-token: ${{ secrets.PAT }}
script: |
@@ -747,10 +747,10 @@ jobs:
flag: [unit-test]
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Download coverage reports of current branch
- uses: actions/download-artifact@v4
+ uses: actions/download-artifact@v7
with:
pattern: coverage-${{ matrix.flag }}-*
@@ -778,7 +778,7 @@ jobs:
flags: ${{ matrix.flag }}
- name: Upload artifacts
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
with:
name: coverage-${{ matrix.flag }}-aggregated
path: |
@@ -799,7 +799,7 @@ jobs:
echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT
- name: Comment on PR with action run URL
- uses: actions/github-script@v7
+ uses: actions/github-script@v8
with:
github-token: ${{ secrets.PAT }}
script: |
@@ -826,6 +826,7 @@ jobs:
always()
&& !cancelled()
&& contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral')
+ && needs.pre-flight.outputs.is_deployment_workflow != 'true'
steps:
- name: Taint node for cleanup
shell: bash
diff --git a/.github/workflows/claude-complexity-label.yml b/.github/workflows/claude-complexity-label.yml
new file mode 100644
index 00000000000..356eed2da29
--- /dev/null
+++ b/.github/workflows/claude-complexity-label.yml
@@ -0,0 +1,60 @@
+name: Claude Complexity Label
+
+on:
+ pull_request_target:
+ types: [ready_for_review]
+
+jobs:
+ label-complexity:
+ name: Label PR Complexity
+ runs-on: ubuntu-latest
+ permissions:
+ contents: read
+ pull-requests: write
+ issues: write
+ id-token: write
+ env:
+ GH_TOKEN: ${{ secrets.PAT }}
+ REPO: ${{ github.repository }}
+ PR_NUMBER: ${{ github.event.pull_request.number }}
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v6
+ with:
+ fetch-depth: 0
+
+ - name: Run Claude Complexity Analysis
+ uses: anthropics/claude-code-action@v1
+ with:
+ anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
+ github_token: ${{ secrets.PAT }}
+ prompt: |
+ REPO: ${{ env.REPO }}
+ PR NUMBER: ${{ env.PR_NUMBER }}
+
+ You are a PR complexity analyzer. Your job is to analyze the diff of this PR and apply exactly one complexity label.
+
+ STEPS:
+ 1. Get the PR diff by running: gh pr diff $PR_NUMBER --repo $REPO
+ 2. Analyze every changed line (added or removed) in the diff and classify each as one of:
+ - "docs-only": changes to docstrings, comments (lines starting with # or //), documentation files (.md, .rst, .txt), or similar non-functional text
+ - "test": changes in test files (files with "test" in the name/path, or inside a tests/ directory)
+ - "real code": all other changes (functional source code)
+ 3. Compute "real code line changes" using this formula:
+ real_code_line_changes = (number of real code lines changed) + (number of test lines changed / 10)
+ Count both added and removed lines. Do not count unchanged context lines. Do not count comments or docstrings.
+ 4. Remove any previously applied complexity or docs-only labels:
+ gh pr edit $PR_NUMBER --repo $REPO --remove-label "complexity: low,complexity: medium,complexity: high,docs-only"
+ 5. Apply exactly ONE label using the gh CLI:
+ - If there are ZERO real code lines and ZERO test lines (only docs-only changes), apply label "docs-only":
+ gh pr edit $PR_NUMBER --repo $REPO --add-label "docs-only"
+ - If real_code_line_changes < 100, apply label "complexity: low":
+ gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: low"
+ - If real_code_line_changes >= 100 and < 500, apply label "complexity: medium":
+ gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: medium"
+ - If real_code_line_changes >= 500, apply label "complexity: high":
+ gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: high"
+
+ Do NOT post any comments on the PR. Only apply the label.
+ claude_args: |
+ --allowedTools "Bash(gh pr diff:*),Bash(gh pr edit:*),Bash(gh pr view:*)"
diff --git a/.github/workflows/claude_review.yml b/.github/workflows/claude_review.yml
new file mode 100644
index 00000000000..2a159c07d20
--- /dev/null
+++ b/.github/workflows/claude_review.yml
@@ -0,0 +1,67 @@
+name: Claude Code Review
+
+on:
+ issue_comment:
+ types: [created]
+
+jobs:
+ review-on-comment:
+ name: Claude Review (comment trigger)
+ if: |
+ github.event_name == 'issue_comment' &&
+ github.event.issue.pull_request &&
+ contains(github.event.comment.body, '/claude review')
+ runs-on: ubuntu-latest
+ permissions:
+ contents: read
+ pull-requests: write
+ issues: write
+ id-token: write
+ env:
+ GH_TOKEN: ${{ github.token }}
+ REPO: ${{ github.repository }}
+ PR_NUMBER: ${{ github.event.issue.number }}
+ steps:
+ - name: Get PR head commit
+ id: get-pr-head-commit
+ run: |
+ echo "sha=$(gh pr view $PR_NUMBER --repo $REPO --json headRefOid -q .headRefOid)" | tee -a $GITHUB_OUTPUT
+
+ - name: Checkout repository
+ uses: actions/checkout@v6
+ with:
+ fetch-depth: 1
+ ref: ${{ steps.get-pr-head-commit.outputs.sha }}
+
+ - name: Run Claude Code Review
+ uses: anthropics/claude-code-action@v1
+ with:
+ anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
+ trigger_phrase: "/claude review"
+ show_full_output: true
+ claude_args: |
+ --allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr review:*)"
+ --model "claude-opus-4-6"
+ prompt: |
+ REPO: ${{ env.REPO }}
+ PR NUMBER: ${{ env.PR_NUMBER }}
+
+ You are doing a light code review. Keep it concise and actionable.
+
+ Focus ONLY on:
+ - Critical bugs or logic errors
+ - Typos in code, comments, or strings
+ - Missing or insufficient test coverage for changed code
+ - Outdated or inaccurate documentation affected by the changes
+
+ Do NOT comment on:
+ - Style preferences or formatting
+ - Minor naming suggestions
+ - Architectural opinions or refactoring ideas
+ - Performance unless there is a clear, measurable issue
+
+ Provide feedback using inline comments for specific code suggestions.
+ Use top-level comments for general observations.
+
+ It's perfectly acceptable to not have anything to comment on.
+ If you do not have anything to comment on, post "LGTM".
diff --git a/.github/workflows/config/changelog-config.json b/.github/workflows/config/changelog-config.json
index e640b90a0f3..19fb0e42364 100644
--- a/.github/workflows/config/changelog-config.json
+++ b/.github/workflows/config/changelog-config.json
@@ -15,7 +15,7 @@
},
"transformers": [],
"max_tags_to_fetch": 100,
- "max_pull_requests": 500,
+ "max_pull_requests": 1250,
"max_back_track_time_days": 365,
"exclude_merge_branches": [],
"tag_resolver": {
diff --git a/.github/workflows/copyright-check.yml b/.github/workflows/copyright-check.yml
index a7f51cd8a0e..33d30944f8d 100644
--- a/.github/workflows/copyright-check.yml
+++ b/.github/workflows/copyright-check.yml
@@ -49,7 +49,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Result
env:
diff --git a/.github/workflows/force-draft-pr.yml b/.github/workflows/force-draft-pr.yml
new file mode 100644
index 00000000000..d45dabf14b7
--- /dev/null
+++ b/.github/workflows/force-draft-pr.yml
@@ -0,0 +1,36 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+name: Force Draft PR
+
+on:
+ pull_request_target:
+ types: [opened]
+ branches:
+ - main
+
+permissions:
+ pull-requests: write
+
+jobs:
+ force-draft:
+ runs-on: ubuntu-latest
+ if: ${{ !github.event.pull_request.draft && github.repository == 'NVIDIA/Megatron-LM' }}
+ steps:
+ - name: Convert PR to draft
+ env:
+ GH_TOKEN: ${{ secrets.PAT }}
+ run: |
+ gh pr ready --undo ${{ github.event.pull_request.number }} --repo ${{ github.repository }}
+
+ - name: Add comment explaining draft policy
+ env:
+ GH_TOKEN: ${{ github.token }}
+ run: |
+ gh pr comment ${{ github.event.pull_request.number }} --repo ${{ github.repository }} --body \
+ "This PR has been automatically converted to **draft** because all PRs must start as drafts.
+
+ When you are ready for review, click **Ready for Review** to begin the review process. This will:
+ 1. Add the oncall reviewer (optional reviewer)
+ 2. Add required review teams based on your changes
+
+ See the [contribution guide](https://github.com/NVIDIA/Megatron-LM/blob/main/docs/developer/submit.md) for more details."
diff --git a/.github/workflows/install-test.yml b/.github/workflows/install-test.yml
index 5a0abb8596d..060e1c5ade0 100644
--- a/.github/workflows/install-test.yml
+++ b/.github/workflows/install-test.yml
@@ -49,7 +49,7 @@ jobs:
python-version: ["3.12"]
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set PATH
run: |
@@ -65,7 +65,7 @@ jobs:
run: bash docker/common/install.sh --environment dev --base-image pytorch --python-version ${{ matrix.python-version }}
- name: Checkout check-imports
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
repository: NVIDIA-NeMo/FW-CI-templates
ref: v0.63.2
@@ -94,7 +94,7 @@ jobs:
python-version: ["3.12"]
steps:
- name: Checkout repository
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set PATH
run: |
@@ -113,7 +113,7 @@ jobs:
# NGC PyTorch 25.05 has a version of triton that is broken on CPU only machines.
# - name: Checkout check-imports
- # uses: actions/checkout@v4
+ # uses: actions/checkout@v6
# with:
# repository: NVIDIA-NeMo/FW-CI-templates
# ref: v0.63.2
@@ -139,7 +139,7 @@ jobs:
&& github.repository == 'NVIDIA/Megatron-LM'
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Get workflow result
id: result
diff --git a/.github/workflows/oncall-assign.yml b/.github/workflows/oncall-assign.yml
index d4cc47d5f9e..6da0776ffc2 100644
--- a/.github/workflows/oncall-assign.yml
+++ b/.github/workflows/oncall-assign.yml
@@ -16,7 +16,7 @@ name: Oncall Assign
on:
pull_request_target:
- types: [opened, ready_for_review]
+ types: [ready_for_review]
branches:
- main
@@ -30,10 +30,10 @@ jobs:
if: ${{ !github.event.pull_request.draft }}
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v6
with:
python-version: '3.10'
diff --git a/.github/workflows/oncall-rotation.yml b/.github/workflows/oncall-rotation.yml
index a621be7f652..0d5f774e441 100644
--- a/.github/workflows/oncall-rotation.yml
+++ b/.github/workflows/oncall-rotation.yml
@@ -28,12 +28,12 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
with:
token: ${{ secrets.PAT }}
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v6
with:
python-version: "3.10"
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 647e6af2379..a756d49eb20 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -44,6 +44,11 @@ on:
description: Branch for version bump
required: true
type: string
+ gh-release-from-tag:
+ description: Tag of previous release for changelog builder
+ required: false
+ type: string
+ default: ""
permissions:
contents: write # To read repository content
@@ -59,7 +64,16 @@ jobs:
create-gh-release: ${{ inputs.create-gh-release || true }}
gh-release-use-changelog-builder: ${{ inputs.generate-changelog }}
publish-docs: ${{ inputs.publish-docs }}
+ gh-release-from-tag: ${{ inputs.gh-release-from-tag }}
secrets:
TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }}
SLACK_WEBHOOK: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SLACK_MAIN_CHANNEL_WEBHOOK || secrets.SLACK_CI_CHANNEL_WEBHOOK }}
PAT: ${{ secrets.PAT }}
+ AWS_ASSUME_ROLE_ARN: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
+ AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ AKAMAI_HOST: ${{ secrets.AKAMAI_HOST }}
+ AKAMAI_CLIENT_TOKEN: ${{ secrets.AKAMAI_CLIENT_TOKEN }}
+ AKAMAI_CLIENT_SECRET: ${{ secrets.AKAMAI_CLIENT_SECRET }}
+ AKAMAI_ACCESS_TOKEN: ${{ secrets.AKAMAI_ACCESS_TOKEN }}
+ S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }}
diff --git a/.github/workflows/review-trigger.yml b/.github/workflows/review-trigger.yml
new file mode 100644
index 00000000000..28abf259882
--- /dev/null
+++ b/.github/workflows/review-trigger.yml
@@ -0,0 +1,28 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Lightweight workflow that triggers on review approval; otherwise there is no access to the right secrets.
+# No secrets needed — just signals auto-swap-labels.yml via workflow_run.
+
+name: Review Trigger
+
+on:
+ pull_request_review:
+ types: [submitted]
+
+jobs:
+ signal:
+ runs-on: ubuntu-latest
+ if: >-
+ github.event.review.state == 'approved' &&
+ github.event.pull_request.base.ref == 'main' &&
+ github.repository == 'NVIDIA/Megatron-LM'
+ steps:
+ - name: Save PR number
+ run: |
+ mkdir -p pr
+ echo "${{ github.event.pull_request.number }}" > pr/number
+ - name: Upload PR number
+ uses: actions/upload-artifact@v4
+ with:
+ name: pr-number
+ path: pr/
diff --git a/.github/workflows/sync-team-usergroups.yml b/.github/workflows/sync-team-usergroups.yml
index fb48a6ca5d4..7f32ac55c57 100644
--- a/.github/workflows/sync-team-usergroups.yml
+++ b/.github/workflows/sync-team-usergroups.yml
@@ -24,10 +24,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v6
with:
python-version: "3.10"
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a238f2c9999..2eb1b43be0c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,16 +1,16 @@
.merge_train_rule: &merge_train_rule
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'no'
+ INTEGRATION_TEST: "no"
INTEGRATION_TEST_SCOPE: mr
- FUNCTIONAL_TEST: 'yes'
+ FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr-slim
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_TIME_LIMIT: 2700
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
workflow:
rules:
@@ -29,36 +29,42 @@ workflow:
auto_cancel:
on_new_commit: none
- # For manual pipelines
+ # For manual pipelines (GitLab UI)
- if: $CI_PIPELINE_SOURCE == "web"
+ # For pipelines created via the REST API (personal access token)
+ - if: $CI_PIPELINE_SOURCE == "api"
+
+ # For trigger pipelines
+ - if: $CI_PIPELINE_SOURCE == "trigger"
+
# For push to main
- if: $CI_PIPELINE_SOURCE == 'push' && ($CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "dev" || $CI_COMMIT_BRANCH =~ /^core_/)
variables:
- UNIT_TEST: 'no'
- INTEGRATION_TEST: 'no'
- FUNCTIONAL_TEST: 'yes'
+ UNIT_TEST: "no"
+ INTEGRATION_TEST: "no"
+ FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_REPEAT: 5
- FUNCTIONAL_TEST_RECORD_CHECKPOINTS: 'no'
+ FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 3600
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
auto_cancel:
on_new_commit: interruptible
# For merge-trains that need to be fast-tracked
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' && $CI_MERGE_REQUEST_LABELS =~ /fast-track/
variables:
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'no'
- FUNCTIONAL_TEST: 'no'
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ INTEGRATION_TEST: "no"
+ FUNCTIONAL_TEST: "no"
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
# For normal merge-trains
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train'
@@ -67,75 +73,75 @@ workflow:
# For MRs with integration suite
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run tests/
variables:
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'yes'
+ INTEGRATION_TEST: "yes"
INTEGRATION_TEST_SCOPE: mr
- FUNCTIONAL_TEST: 'no'
+ FUNCTIONAL_TEST: "no"
FUNCTIONAL_TEST_SCOPE: mr-slim
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_TIME_LIMIT: 2700
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
# For MRs with nightly
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run nightly/
variables:
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'no'
- FUNCTIONAL_TEST: 'yes'
+ INTEGRATION_TEST: "no"
+ FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: nightly
FUNCTIONAL_TEST_REPEAT: 5
- FUNCTIONAL_TEST_RECORD_CHECKPOINTS: 'no'
+ FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 2700
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
# For MRs with weekly
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run weekly/
variables:
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'no'
- FUNCTIONAL_TEST: 'yes'
+ INTEGRATION_TEST: "no"
+ FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: weekly
FUNCTIONAL_TEST_REPEAT: 1
- FUNCTIONAL_TEST_RECORD_CHECKPOINTS: 'no'
+ FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 9000
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
# For MRs with heavy suite
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run functional tests/
variables:
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'no'
- FUNCTIONAL_TEST: 'yes'
+ INTEGRATION_TEST: "no"
+ FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_TIME_LIMIT: 2700
- CLUSTER_A100: ''
- CLUSTER_H100: ''
- PUBLISH: 'no'
+ CLUSTER_A100: ""
+ CLUSTER_H100: ""
+ PUBLISH: "no"
# Default MRs
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result'
variables:
- UNIT_TEST: 'yes'
+ UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
- INTEGRATION_TEST: 'no'
- FUNCTIONAL_TEST: 'no'
- PUBLISH: 'no'
+ INTEGRATION_TEST: "no"
+ FUNCTIONAL_TEST: "no"
+ PUBLISH: "no"
- when: never
@@ -157,109 +163,109 @@ default:
variables:
BUILD:
- value: 'yes'
+ value: "yes"
UNIT_TEST:
- value: 'yes'
+ value: "yes"
options:
- - 'yes'
- - 'no'
+ - "yes"
+ - "no"
description: To run the funtional test suite
UNIT_TEST_REPEAT:
- value: '1'
- description: 'Number of repetitions'
+ value: "1"
+ description: "Number of repetitions"
UNIT_TEST_TIMEOUT:
- value: '30'
+ value: "30"
description: Timeout (minutes) for Unit tests (all repeats)
INTEGRATION_TEST:
- value: 'yes'
+ value: "yes"
options:
- - 'yes'
- - 'no'
+ - "yes"
+ - "no"
description: To run the integration test suite
INTEGRATION_TEST_SCOPE:
- value: 'mr'
+ value: "mr"
options:
- - 'mr'
- - 'nightly'
- - 'weekly'
- - 'pre-release'
- - 'release'
- description: 'Testsuite to run (only for INTEGRATION_TEST=yes)'
+ - "mr"
+ - "nightly"
+ - "weekly"
+ - "pre-release"
+ - "release"
+ description: "Testsuite to run (only for INTEGRATION_TEST=yes)"
INTEGRATION_TEST_TIME_LIMIT:
- value: '900'
- description: 'Timeout in seconds per test'
+ value: "900"
+ description: "Timeout in seconds per test"
INTEGRATION_TEST_CASES:
- value: 'all'
+ value: "all"
description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST:
- value: 'yes'
+ value: "yes"
options:
- - 'yes'
- - 'no'
+ - "yes"
+ - "no"
description: To run the funtional test suite
FUNCTIONAL_TEST_SCOPE:
- value: 'mr'
+ value: "mr"
options:
- - 'mr'
- - 'nightly'
- - 'weekly'
- - 'pre-release'
- - 'release'
- description: 'Testsuite to run (only for FUNCTIONAL_TEST=yes)'
+ - "mr"
+ - "nightly"
+ - "weekly"
+ - "pre-release"
+ - "release"
+ description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)"
FUNCTIONAL_TEST_REPEAT:
- value: '5'
- description: 'Number of repetitions per test'
+ value: "5"
+ description: "Number of repetitions per test"
FUNCTIONAL_TEST_TIME_LIMIT:
- value: '2700'
- description: 'Timeout in seconds per test'
+ value: "2700"
+ description: "Timeout in seconds per test"
FUNCTIONAL_TEST_CASES:
- value: 'all'
+ value: "all"
description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST_NAME:
- description: 'Name of functional test run (only for pre-release and release)'
- value: '$$CI_COMMIT_SHA'
+ description: "Name of functional test run (only for pre-release and release)"
+ value: "$$CI_COMMIT_SHA"
FUNCTIONAL_TEST_RECORD_CHECKPOINTS:
- value: 'no'
- description: 'Record golden checkpoints'
+ value: "no"
+ description: "Record golden checkpoints"
options:
- - 'yes'
- - 'no'
+ - "yes"
+ - "no"
CLUSTER_A100:
- value: 'dgxa100_dracooci'
+ value: "dgxa100_dracooci"
options:
- - 'dgxa100_dracooci'
- - 'dgxa100_dracooci-ord'
- description: 'Cluster for A100 workloads'
+ - "dgxa100_dracooci"
+ - "dgxa100_dracooci-ord"
+ description: "Cluster for A100 workloads"
CLUSTER_H100:
- value: 'dgxh100_coreweave'
+ value: "dgxh100_coreweave"
options:
- - 'dgxh100_coreweave'
- - 'dgxh100_eos'
- description: 'Cluster for H100 workloads'
+ - "dgxh100_coreweave"
+ - "dgxh100_eos"
+ description: "Cluster for H100 workloads"
CLUSTER_GB200:
- value: 'dgxgb200_oci-hsg'
+ value: "dgxgb200_oci-hsg"
options:
- - 'dgxgb200_oci-hsg'
- description: 'Cluster for H100 workloads'
+ - "dgxgb200_oci-hsg"
+  description: "Cluster for GB200 workloads"
PUBLISH:
- value: 'no'
+ value: "no"
options:
- - 'yes'
- - 'no'
+ - "yes"
+ - "no"
description: Build and publish a wheel to PyPi
PUBLISH_COMMIT:
- value: '$$CI_COMMIT_SHA'
+ value: "$$CI_COMMIT_SHA"
description: Which commit to publish
PUBLISH_VERSION_BUMP_BRANCH:
- value: '$$CI_COMMIT_BRANCH'
+ value: "$$CI_COMMIT_BRANCH"
description: Which branch to target for version bump
PUBLISH_SCOPE:
- value: 'code-freeze'
+ value: "code-freeze"
options:
- - 'code-freeze'
- - 'release'
- - 'review-reminder'
- - 'upgrade-dependencies'
+ - "code-freeze"
+ - "release"
+ - "review-reminder"
+ - "upgrade-dependencies"
description: Type of publish (freeze or final release)
# CI wide variables
@@ -267,7 +273,7 @@ variables:
CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
UTILITY_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_utility
- TE_GIT_REF: ''
+ TE_GIT_REF: ""
include:
- .gitlab/stages/00.pre.yml
diff --git a/.gitlab/stages/01.build.yml b/.gitlab/stages/01.build.yml
index 61521295a93..f5a80c4074d 100644
--- a/.gitlab/stages/01.build.yml
+++ b/.gitlab/stages/01.build.yml
@@ -64,12 +64,12 @@ test:pre_build_image:
- IMAGE: CI_MCORE_DEV_IMAGE
FILE: Dockerfile.ci.dev
IMAGE_TYPE: dev
- BASE_IMAGE: nvcr.io/nvidia/pytorch:25.11-py3
+ BASE_IMAGE: nvcr.io/nvidia/pytorch:26.02-py3
PLATFORM: amd64
- IMAGE: CI_MCORE_DEV_IMAGE
FILE: Dockerfile.ci.dev
IMAGE_TYPE: dev
- BASE_IMAGE: nvcr.io/nvidia/pytorch:25.11-py3
+ BASE_IMAGE: nvcr.io/nvidia/pytorch:26.02-py3
PLATFORM: arm64
- IMAGE: UTILITY_IMAGE
FILE: Dockerfile.linting
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 00000000000..aa37017f082
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,14 @@
+comment: false
+coverage:
+ status:
+ project: false
+ patch:
+ default:
+ target: 80%
+ threshold: 5%
+ base: auto
+ if_ci_failed: error
+ if_no_uploads: success
+ if_not_found: success
+fixes:
+ - "/opt/megatron-lm/::"
diff --git a/docs/api-backwards-compatibility-check.md b/docs/api-backwards-compatibility-check.md
index e1b6939b06f..40f56ec0c00 100644
--- a/docs/api-backwards-compatibility-check.md
+++ b/docs/api-backwards-compatibility-check.md
@@ -1,3 +1,7 @@
+---
+orphan: true
+---
+
----
-orphan: true
----
-
# Documentation Development
- [Documentation Development](#documentation-development)
diff --git a/docs/get-started/install.md b/docs/get-started/install.md
index e1d7202b3fc..5781d065fae 100644
--- a/docs/get-started/install.md
+++ b/docs/get-started/install.md
@@ -7,90 +7,117 @@
license agreement from NVIDIA CORPORATION is strictly prohibited.
-->
-# Megatron Core Installation
-
-Installation is supported using Docker and pip.
+# Installation
## System Requirements
-### Hardware Requirements
+### Hardware
-- **FP8 Support**: NVIDIA Hopper, Ada, Blackwell GPUs
- **Recommended**: NVIDIA Turing architecture or later
+- **FP8 Support**: Requires NVIDIA Hopper, Ada, or Blackwell GPUs
-### Software Requirements
+### Software
-- **CUDA/cuDNN/NCCL**: Latest stable versions
-- **PyTorch**: Latest stable version
-- **Transformer Engine**: Latest stable version
-- **Python**: 3.12 recommended
+- **Python**: >= 3.10 (3.12 recommended)
+- **PyTorch**: >= 2.6.0
+- **CUDA Toolkit**: Latest stable version
-## Docker Installation (Recommended)
+## Prerequisites
-We strongly recommend using the previous releases of [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) rather than the latest one for optimal compatibility with Megatron Core release and testing matrix. Our releases are always based on the previous month's NGC container, so this ensures compatibility and stability.
+Install [uv](https://docs.astral.sh/uv/), a fast Python package installer:
-**Note:** The NGC PyTorch container constraints the python environment globally via `PIP_CONSTRAINT`. In the following examples we will unset the variable.
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
-This container comes with all dependencies pre-installed with compatible versions and optimized configurations for NVIDIA GPUs:
-- PyTorch (latest stable version)
-- CUDA, cuDNN, NCCL (latest stable versions)
-- Support for FP8 on NVIDIA Hopper, Ada, and Blackwell GPUs
-- For best performance, use NVIDIA Turing GPU architecture generations and later
+## Option A: Pip Install (Recommended)
+
+Install the latest stable release from PyPI:
```bash
-# Run container with mounted directories
-docker run --runtime --nvidia --gpus all -it --rm \
- -v /path/to/megatron:/workspace/megatron \
- -v /path/to/dataset:/workspace/dataset \
- -v /path/to/checkpoints:/workspace/checkpoints \
- -e PIP_CONSTRAINT= \
- nvcr.io/nvidia/pytorch:25.04-py3
+uv pip install megatron-core
```
-## Pip Installation
+To include optional training dependencies (Weights & Biases, SentencePiece, HF Transformers):
+
+```bash
+uv pip install "megatron-core[training]"
+```
-Megatron Core installation offers support for two NGC PyTorch containers:
+For all extras including [Transformer Engine](https://github.com/NVIDIA/TransformerEngine):
-- `dev`: Moving head that supports the most recent upstream dependencies
-- `lts`: Long-term support of NGC PyTorch 24.01
+```bash
+uv pip install --group build
+uv pip install --no-build-isolation "megatron-core[training,dev]"
+```
-Both containers can be combined with `mlm`, which adds package dependencies for Megatron-LM on top of Megatron Core.
+```{note}
+`--no-build-isolation` requires build dependencies to be pre-installed in the environment. `torch` is needed because several `[dev]` packages (`mamba-ssm`, `nv-grouped-gemm`, `transformer-engine`) import it at build time to compile CUDA kernels. Expect this step to take **20+ minutes** depending on your hardware. If you prefer pre-built binaries, the [NGC Container](#option-c-ngc-container) ships with these pre-compiled.
+```
+```{warning}
+Building from source can consume a large amount of memory. By default the build runs one compiler job per CPU core, which may cause out-of-memory failures on machines with many cores. To limit parallel compilation jobs, set the `MAX_JOBS` environment variable before installing (e.g. `MAX_JOBS=4`).
+```
-1. Install the latest release dependencies
+```{tip}
+For a lighter set of development dependencies without Transformer Engine and ModelOpt, use `[lts]` instead of `[dev]`: `uv pip install --no-build-isolation "megatron-core[training,lts]"`. The `[lts]` and `[dev]` extras are mutually exclusive.
+```
- ```bash
- pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
- pip install --no-build-isolation megatron-core[dev]
- ```
+To clone the repository for examples:
-2. Next choose one of the following options:
+```bash
+git clone https://github.com/NVIDIA/Megatron-LM.git
+```
-* For running an Megatron LM application
- ```bash
- pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
- pip install --no-build-isolation megatron-core[mlm,dev]
- ```
-* Install packages for LTS support NGC PyTorch 24.01
+## Option B: Install from Source
- ```bash
- pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
- pip install --no-build-isolation megatron-core[lts]
- ```
+For development or to run the latest unreleased code:
-* For running an Megatron LM application
+```bash
+git clone https://github.com/NVIDIA/Megatron-LM.git
+cd Megatron-LM
+uv pip install -e .
+```
- ```bash
- pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
- pip install --no-build-isolation megatron-core[mlm,lts]
- ```
+To install with all development dependencies (includes Transformer Engine, requires pre-installed build deps):
-* For a version of Megatron Core with only Torch, run
+```bash
+uv pip install --group build
+uv pip install --no-build-isolation -e ".[training,dev]"
+```
+
+```{tip}
+If the build runs out of memory, limit parallel compilation jobs with `MAX_JOBS=4 uv pip install --no-build-isolation -e ".[training,dev]"`.
+```
+
+
+## Option C: NGC Container
+
+For a pre-configured environment with all dependencies pre-installed (PyTorch, CUDA, cuDNN, NCCL, Transformer Engine), use the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch).
+
+We recommend using the **previous month's** NGC container rather than the latest one to ensure compatibility with the current Megatron Core release and testing matrix.
+
+```bash
+docker run --gpus all -it --rm \
+ -v /path/to/dataset:/workspace/dataset \
+ -v /path/to/checkpoints:/workspace/checkpoints \
+ -e PIP_CONSTRAINT= \
+ nvcr.io/nvidia/pytorch:26.01-py3
+```
+
+```{note}
+The NGC PyTorch container constrains the Python environment globally via `PIP_CONSTRAINT`. The `-e PIP_CONSTRAINT=` flag above unsets this so that Megatron Core and its dependencies install correctly.
+```
+
+Then install Megatron Core inside the container (torch is already available in the NGC image):
+
+```bash
+pip install uv
+uv pip install --no-build-isolation "megatron-core[training,dev]"
+```
- ```bash
- pip install megatron-core
- ```
+You are now ready to run training. See [Your First Training Run](quickstart.md) for next steps.
diff --git a/docs/get-started/overview.md b/docs/get-started/overview.md
index 42b964d5cec..b7f84ee22e5 100644
--- a/docs/get-started/overview.md
+++ b/docs/get-started/overview.md
@@ -87,7 +87,7 @@ After training or modifying a Megatron model, you can convert it again for deplo
- **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
- **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
- **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
-- **[Model Optimizer (ModelOpt)](https://github.com/NVIDIA/Model-Optimizer)** - Model optimization toolkit for quantization, pruning, distillation, speculative decoding, and more. Checkout end-to-end examples in [examples/post_training/modelopt](./examples/post_training/modelopt/).
+- **[Model Optimizer (ModelOpt)](https://github.com/NVIDIA/Model-Optimizer)** - Model optimization toolkit for quantization, pruning, distillation, speculative decoding, and more. Check out end-to-end examples in [examples/post_training/modelopt](https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt).
**Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
diff --git a/docs/get-started/quickstart.md b/docs/get-started/quickstart.md
index 2addcb519a2..c8797aeedd4 100644
--- a/docs/get-started/quickstart.md
+++ b/docs/get-started/quickstart.md
@@ -7,54 +7,40 @@
license agreement from NVIDIA CORPORATION is strictly prohibited.
-->
-# Quick Start
+# Your First Training Run
-## Quick Installation
+This guide walks you through running your first training jobs with Megatron Core. Make sure you have completed [installation](install.md) before proceeding.
-Install Megatron Core with pip:
+## Simple Training Example
-1. Install Megatron Core with required dependencies:
-
- ```bash
- pip install --no-build-isolation megatron-core[mlm,dev]
- ```
-
-2. Clone repository for examples:
-
- ```bash
- git clone https://github.com/NVIDIA/Megatron-LM.git
- cd Megatron-LM
- pip install --no-build-isolation .[mlm,dev]
- ```
-
-That's it! You're ready to start training.
-
-## Your First Training Run
-
-### Simple Training Example
+Run a minimal distributed training loop with mock data on 2 GPUs:
```bash
-# Distributed training example (2 GPUs, mock data)
torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py
```
-### LLaMA-3 Training Example
+## LLaMA-3 Training Example
+
+Train a LLaMA-3 8B model with FP8 precision on 8 GPUs using mock data:
```bash
-# 8 GPUs, FP8 precision, mock data
-./examples/llama/train_llama3_8b_fp8.sh
+./examples/llama/train_llama3_8b_h100_fp8.sh
```
## Data Preparation
-### JSONL Data Format
+To train on your own data, Megatron expects preprocessed binary files (`.bin` and `.idx`).
+
+### 1. Prepare a JSONL File
+
+Each line should contain a `text` field:
```json
{"text": "Your training text here..."}
{"text": "Another training sample..."}
```
-### Basic Preprocessing
+### 2. Preprocess the Data
```bash
python tools/preprocess_data.py \
diff --git a/docs/index.md b/docs/index.md
index c68f608a73b..4b75ed2c0c8 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -38,8 +38,8 @@ get-started/releasenotes
:hidden:
:caption: Get Started
-get-started/quickstart
get-started/install
+get-started/quickstart
```
```{toctree}
@@ -71,6 +71,7 @@ user-guide/features/custom_fsdp
user-guide/features/dist_optimizer
user-guide/features/optimizer_cpu_offload
user-guide/features/pipeline_parallel_layout
+user-guide/features/fine_grained_activation_offloading
user-guide/features/megatron_energon
user-guide/features/megatron_rl
user-guide/features/tokenizers
diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md
index 076409cd4f5..95568adce78 100644
--- a/docs/llama_mistral.md
+++ b/docs/llama_mistral.md
@@ -11,7 +11,7 @@
NOTE: In order to simplify code we now only support converting llama-3.x and mistral checkpoints downloaded from Hugging Face. For converting other models, see [Megatron Bridge](models/index.md).
-The [Llama-2](https://ai.meta.com/llama/) and [Llama-3.x](https://llama.meta.com/) family of models are an open-source set of pretrained & finetuned (for chat) models that have achieved strong results across a wide set of benchmarks. At their times of release, both Llama-2 and Llama-3 models achieved among the best results for open-source models, and were competitive with leading closed-source models (see https://arxiv.org/pdf/2307.09288.pdf and https://ai.meta.com/blog/meta-llama-3/).
+The Llama-2 and Llama-3.x family of models are an open-source set of pretrained & finetuned (for chat) models that have achieved strong results across a wide set of benchmarks. At their times of release, both Llama-2 and Llama-3 models achieved among the best results for open-source models, and were competitive with leading closed-source models (see the [Llama-2 paper](https://arxiv.org/pdf/2307.09288.pdf) and the [Llama-3 announcement](https://ai.meta.com/blog/meta-llama-3/)).
Similarly, [Mistral-7b](https://mistral.ai/news/announcing-mistral-7b/) is an open-source model with pretrained and finetuned (for chat) variants that achieve strong benchmark results.
@@ -50,7 +50,6 @@ Architecturally Llama-2, Llama-3 and Mistral-7b are very similar. As such Megatr
- [Known numerical differences](#known-numerical-differences)
- [Using legacy model format](#using-legacy-model-format)
-
# Llama-2
Llama-2 checkpoints can be loaded into Megatron for inference and for finetuning. Loading these checkpoints consists of three steps:
@@ -63,7 +62,7 @@ The following sections detail these steps. The final section lists benchmark res
## Download Meta or Huggingface checkpoints
-Users must first apply for access to download the Llama-2 checkpoints either directly from [Meta](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or through [Huggingface](https://huggingface.co/docs/transformers/main/model_doc/llama2) (HF). The checkpoints are available in two formats, Meta's native format (available from both the Meta and HF links), and HF's format (available only from HF). Either format can be converted to Megatron, as detailed next.
+Users must first apply for access to download the Llama-2 checkpoints through [Huggingface](https://huggingface.co/docs/transformers/main/model_doc/llama2) (HF). The checkpoints are available in two formats: Meta's native format and HF's format, both available from HF. Either format can be converted to Megatron, as detailed next.
## Convert checkpoint format
@@ -149,11 +148,11 @@ If loading for either inference or finetuning, use the following arguments:
### Launch Meta
-Meta checkpoints can be launched with: https://github.com/facebookresearch/llama
+Meta checkpoints can be launched with Meta's reference implementation in the `facebookresearch/llama` repository.
### Launch Huggingface
-Huggingface checkpoints can be launched with: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
+Huggingface checkpoints can be launched with the Llama implementation in the Huggingface `transformers` library (`modeling_llama.py`).
## Benchmark results
@@ -361,7 +360,7 @@ The following sections detail these steps.
## Download Huggingface checkpoints
-Users must first apply for access to download the Mistral-7b checkpoints through [Huggingface](https://huggingface.co/mistralai/Mistral-7B-v0.3) (HF).
+Users must first apply for access to download the Mistral-7b checkpoints through Huggingface. Two variants are available: the base model ([Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3)) and the instruct model ([Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)).
## Convert checkpoint format
@@ -437,7 +436,7 @@ Many models such as Yi-34B and Qwen2.x use the Llama architecture and may be con
It is not expected that the megatron and Huggingface implementations of llama3.x and mistral models will produce numerically identical results. There are multiple points where small numerical differences are expected. This is a non-exhaustive list:
-1. TransformerEngine (TE) uses the model params_dtype inside RMSNorm whereas the Huggingface implementation uses fp32. See for details: https://github.com/NVIDIA/TransformerEngine/issues/1132
+1. TransformerEngine (TE) uses the model params_dtype inside RMSNorm whereas the Huggingface implementation uses fp32 (see TransformerEngine issue #1132 for details).
2. Huggingface `transformers` implements the q, k and v projections in self-attention as separate GEMMs whereas Megatron core combines them into a single GEMM for efficiency. This leads to small numerical differences.
# Using legacy model format
diff --git a/docs/user-guide/data-preparation.md b/docs/user-guide/data-preparation.md
index 18da2d80fe1..ea91bee4309 100644
--- a/docs/user-guide/data-preparation.md
+++ b/docs/user-guide/data-preparation.md
@@ -46,6 +46,46 @@ python tools/preprocess_data.py \
| `--workers` | Number of parallel workers for processing |
| `--append-eod` | Add end-of-document token |
+## Finding Optimal Number of Workers
+
+Use the `--find-optimal-num-workers` flag to find the number of workers that gives the best performance in terms of preprocessed documents per second.
+The script will launch a few short data preprocessing runs, each with a different number of workers, and determine the fastest configuration from the collected performance data.
+
+```bash
+python tools/preprocess_data.py \
+ --input data.jsonl \
+ --output-prefix processed_data \
+ --tokenizer-type HuggingFaceTokenizer \
+ --tokenizer-model /path/to/tokenizer.model \
+ --workers 8 \
+ --find-optimal-num-workers \
+ --workers-to-check 4 8 16 32 \
+ --max-documents 50000
+```
+
+**Required arguments**
+
+| Argument | Description |
+|----------|-------------|
+| `--find-optimal-num-workers` | Enables the search for the optimal number of workers |
+| `--workers-to-check` | List of worker counts to try |
+| `--max-documents` | Number of documents to be preprocessed during each run |
+
+**Output example**
+
+```bash
+-----------------------------------
+Performance results (fastest → slowest):
+1. 16 workers → avg. docs/s: 9606.6476
+2. 32 workers → avg. docs/s: 9275.3284
+3. 8 workers → avg. docs/s: 9151.9280
+4. 4 workers → avg. docs/s: 6391.3819
+
+-----------------------------------
+The most optimal num of workers is 16 with avg. preprocessed docs/s: 9606.6476.
+-----------------------------------
+```
+
## Output Files
The preprocessing tool generates two files:
diff --git a/docs/user-guide/features/index.md b/docs/user-guide/features/index.md
index fc5a1aa1abe..59cef95d574 100644
--- a/docs/user-guide/features/index.md
+++ b/docs/user-guide/features/index.md
@@ -14,6 +14,7 @@ Advanced feature guides for key Megatron Core capabilities.
```{toctree}
:maxdepth: 2
+fine_grained_activation_offloading
moe
context_parallel
custom_fsdp
diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md
index 45e70c3a520..d12f3e35af2 100644
--- a/docs/user-guide/index.md
+++ b/docs/user-guide/index.md
@@ -1,3 +1,7 @@
+---
+orphan: true
+---
+