# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any

import torch
from torch import nn

from nemo_automodel.components.attention.utils import (
    initialize_attn_module_and_func,
    postprocess_output_for_attn,
    preprocess_args_and_kwargs_for_attn,
)
from nemo_automodel.components.models.common import (
    BackendConfig,
    initialize_linear_module,
    initialize_rms_norm_module,
)
from nemo_automodel.components.models.gpt_oss.rope_utils import apply_rotary_emb_qk


class Qwen3Attention(nn.Module):
    """Qwen3 dense attention with per-head QK RMSNorm and RoPE.

    Identical to the Qwen3 MoE attention layer; the attention mechanism
    is shared between the dense and MoE variants.
    """

    def __init__(self, config, backend: BackendConfig):
        super().__init__()
        self.backend = backend

        self.num_heads = config.num_attention_heads
        self.num_kv_heads = config.num_key_value_heads
        self.head_dim = getattr(config, "head_dim", config.hidden_size // self.num_heads)

        attention_bias = getattr(config, "attention_bias", False)

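        # Separate projection widths for Q vs. K/V: with num_kv_heads < num_heads this
        # is grouped-query attention (GQA), so several query heads share each KV head.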
        self.q_proj = initialize_linear_module(
            backend.linear, config.hidden_size, self.num_heads * self.head_dim, attention_bias
        )
        self.k_proj = initialize_linear_module(
            backend.linear, config.hidden_size, self.num_kv_heads * self.head_dim, attention_bias
        )
        self.v_proj = initialize_linear_module(
            backend.linear, config.hidden_size, self.num_kv_heads * self.head_dim, attention_bias
        )
        self.o_proj = initialize_linear_module(
            backend.linear, self.num_heads * self.head_dim, config.hidden_size, attention_bias
        )

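        # Qwen3 normalizes queries and keys per attention head (RMSNorm over head_dim),
        # not over the full hidden size.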
        self.q_norm = initialize_rms_norm_module(backend.rms_norm, self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = initialize_rms_norm_module(backend.rms_norm, self.head_dim, eps=config.rms_norm_eps)

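        # Build the attention callable for the configured backend (backend.attn) with the
        # standard 1/sqrt(head_dim) softmax scaling and GQA group count.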
        softmax_scale = self.head_dim**-0.5
        self.attn_module, self.attn_func = initialize_attn_module_and_func(
            attn_impl=backend.attn,
            num_attention_heads=self.num_heads,
            num_qk_channels=self.head_dim,
            num_v_channels=self.head_dim,
            softmax_scale=softmax_scale,
            num_gqa_groups=self.num_kv_heads,
        )

    def forward(
        self,
        x: torch.Tensor,
        *,
        freqs_cis: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **attn_kwargs: Any,
    ) -> torch.Tensor:
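        # A 2-D input (total_tokens, hidden_size) is treated as the packed "thd" layout
        # (no batch dimension; sequence boundaries come from cu_seqlens in attn_kwargs),
        # while a 3-D input is the padded "bshd" layout (batch, seq, hidden_size).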
        if len(x.shape) == 2:
            qkv_format = "thd"
            num_tokens = x.shape[0]
        else:
            qkv_format = "bshd"
            bsz, seqlen, _ = x.size()

        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        if qkv_format == "thd":
            q = q.view(num_tokens, self.num_heads, self.head_dim)
            k = k.view(num_tokens, self.num_kv_heads, self.head_dim)
            v = v.view(num_tokens, self.num_kv_heads, self.head_dim)
        else:
            q = q.view(bsz, seqlen, self.num_heads, self.head_dim)
            k = k.view(bsz, seqlen, self.num_kv_heads, self.head_dim)
            v = v.view(bsz, seqlen, self.num_kv_heads, self.head_dim)

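        # Per-head QK RMSNorm is applied before the rotary embedding.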
        q = self.q_norm(q)
        k = self.k_norm(k)

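        # Rotary position embedding; packed-sequence offsets (cu_seqlens) and
        # context-parallel rank/size are forwarded so each token keeps its global position.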
        q, k = apply_rotary_emb_qk(
            q,
            k,
            freqs_cis,
            format=qkv_format,
            rope_fusion=self.backend.rope_fusion,
            cu_seqlens=attn_kwargs.get("cu_seqlens", None),
            cp_size=attn_kwargs.get("cp_size", 1),
            cp_rank=attn_kwargs.get("cp_rank", 0),
        )

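        # Convert q/k/v and the mask into the layout expected by the selected attention
        # backend, run attention, then undo any backend-specific reshaping.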
        q, k, v, _attn_kwargs = preprocess_args_and_kwargs_for_attn(
            q, k, v, attention_mask, self.backend.attn, **attn_kwargs
        )
        out = self.attn_func(q, k, v, **_attn_kwargs)
        out = postprocess_output_for_attn(out, self.backend.attn)

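        # Merge the head and head_dim axes back into the hidden dimension before the
        # output projection (start dim 2 for "bshd", 1 for "thd").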
        flatten_dim = 2 if qkv_format == "bshd" else 1
        out = self.o_proj(out.flatten(flatten_dim))
        return out

    def init_weights(self, buffer_device: torch.device, init_std: float = 0.02):
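        # Note: buffer_device is unused here; this layer has no persistent buffers to place.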
        for linear in [self.q_proj, self.k_proj, self.v_proj, self.o_proj]:
            nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std)
            if hasattr(linear, "bias") and linear.bias is not None:
                nn.init.zeros_(linear.bias)
        for norm in (self.q_norm, self.k_norm):
            norm.reset_parameters()