NVIDIA-NeMo · ananthsub · Aug 11, 2025 · Aug 12, 2025 · Aug 25, 2025 · Aug 25, 2025
diff --git a/src/megatron/bridge/models/__init__.py b/src/megatron/bridge/models/__init__.py
@@ -72,6 +72,14 @@
     Qwen25ModelProvider72B,
     Qwen25ModelProvider500M,
 )
+from megatron.bridge.models.starcoder import (
+    Starcoder2ModelProvider,
+    Starcoder2ModelProvider3B,
+    Starcoder2ModelProvider7B,
+    Starcoder2ModelProvider15B,
+    StarcoderModelProvider,
+    StarcoderModelProvider15B,
+)
 from megatron.bridge.models.t5_provider import T5ModelProvider
 
 
@@ -130,4 +138,10 @@
     "Qwen3MoEModelProvider",
     "Qwen3MoEModelProvider30B_A3B",
     "Qwen3MoEModelProvider235B_A22B",
+    "Starcoder2ModelProvider",
+    "Starcoder2ModelProvider3B",
+    "Starcoder2ModelProvider7B",
+    "Starcoder2ModelProvider15B",
+    "StarcoderModelProvider",
+    "StarcoderModelProvider15B",
 ]
diff --git a/src/megatron/bridge/models/starcoder/__init__.py b/src/megatron/bridge/models/starcoder/__init__.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from megatron.bridge.models.starcoder.starcoder2_provider import (
+    Starcoder2ModelProvider,
+    Starcoder2ModelProvider3B,
+    Starcoder2ModelProvider7B,
+    Starcoder2ModelProvider15B,
+)
+from megatron.bridge.models.starcoder.starcoder_provider import (
+    StarcoderModelProvider,
+    StarcoderModelProvider15B,
+)
+
+
+__all__ = [  # exactly the names imported above from the two provider modules
+    "StarcoderModelProvider",
+    "StarcoderModelProvider15B",
+    "Starcoder2ModelProvider",
+    "Starcoder2ModelProvider3B",
+    "Starcoder2ModelProvider7B",
+    "Starcoder2ModelProvider15B",
+]
diff --git a/src/megatron/bridge/models/starcoder/starcoder2_provider.py b/src/megatron/bridge/models/starcoder/starcoder2_provider.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Callable, List, Optional
+
+import torch.nn.functional as F
+
+from megatron.bridge.models.gpt_provider import GPTModelProvider
+
+
+@dataclass
+class Starcoder2ModelProvider(GPTModelProvider):
+    """
+    Shared Starcoder2 defaults layered on GPTModelProvider; the sized subclasses supply the model dimensions.
+    """
+
+    # configs that are common across model sizes
+    normalization: str = "LayerNorm"
+    activation_func: Callable = F.gelu
+    add_bias_linear: bool = True
+    seq_length: int = 16384
+    position_embedding_type: str = "rope"
+    rotary_percent: float = 1.0
+    hidden_dropout: float = 0.0
+    attention_dropout: float = 0.0
+    init_method_std: float = 0.01  # replaced by a per-size value in the 3B/7B/15B providers
+    share_embeddings_and_output_weights: bool = False
+    kv_channels: Optional[int] = None  # left unset; presumably derived by the parent config (confirm)
+    num_query_groups: Optional[int] = None  # set explicitly by each sized provider
+    window_size: Optional[List[int]] = None
+    attention_softmax_in_fp32: bool = True
+    bias_activation_fusion: bool = True
+    bias_dropout_fusion: bool = True
+    layernorm_epsilon: float = 1e-5
+
+
+@dataclass
+class Starcoder2ModelProvider3B(Starcoder2ModelProvider):
+    """
+    Starcoder2 3B preset: supplies the size-specific fields on top of Starcoder2ModelProvider.
+    """
+
+    num_layers: int = 30
+    hidden_size: int = 3072
+    ffn_hidden_size: int = 12288  # 4 * hidden_size
+    num_query_groups: int = 2  # replaces the base provider's None
+    num_attention_heads: int = 24
+    init_method_std: float = 0.018042  # replaces the base provider's 0.01
+    rotary_base: float = 999999.4420358813
+
+
+@dataclass
+class Starcoder2ModelProvider7B(Starcoder2ModelProvider):
+    """
+    Starcoder2 7B preset: supplies the size-specific fields on top of Starcoder2ModelProvider.
+    """
+
+    num_layers: int = 32
+    hidden_size: int = 4608
+    ffn_hidden_size: int = 18432  # 4 * hidden_size
+    num_query_groups: int = 4  # replaces the base provider's None
+    num_attention_heads: int = 36
+    init_method_std: float = 0.018042  # replaces the base provider's 0.01
+    rotary_base: float = 1_000_000
+
+
+@dataclass
+class Starcoder2ModelProvider15B(Starcoder2ModelProvider):
+    """
+    Starcoder2 15B preset: supplies the size-specific fields on top of Starcoder2ModelProvider.
+    """
+
+    num_layers: int = 40
+    hidden_size: int = 6144
+    ffn_hidden_size: int = 24576  # 4 * hidden_size
+    num_query_groups: int = 4  # replaces the base provider's None
+    num_attention_heads: int = 48
+    init_method_std: float = 0.01275  # replaces the base provider's 0.01
+    rotary_base: float = 100_000
diff --git a/src/megatron/bridge/models/starcoder/starcoder_provider.py b/src/megatron/bridge/models/starcoder/starcoder_provider.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Callable, Optional
+
+import torch.nn.functional as F
+
+from megatron.bridge.models.gpt_provider import GPTModelProvider
+
+
+@dataclass
+class StarcoderModelProvider(GPTModelProvider):
+    """
+    Shared Starcoder defaults layered on GPTModelProvider; the sized subclass supplies the model dimensions.
+    """
+
+    # configs that are common across model sizes
+    normalization: str = "LayerNorm"
+    activation_func: Callable = F.gelu
+    add_bias_linear: bool = True
+    seq_length: int = 8192
+    position_embedding_type: str = "learned_absolute"
+    hidden_dropout: float = 0.2
+    attention_dropout: float = 0.2
+    init_method_std: float = 0.01  # replaced by 0.02 in the 15B provider
+    layernorm_epsilon: float = 1e-5
+    share_embeddings_and_output_weights: bool = False
+    kv_channels: Optional[int] = None  # left unset; presumably derived by the parent config (confirm)
+    num_query_groups: int = 1  # not overridden by the 15B provider, so it keeps this value
+    attention_softmax_in_fp32: bool = True
+    bias_activation_fusion: bool = True
+    bias_dropout_fusion: bool = True
+
+
+@dataclass
+class StarcoderModelProvider15B(StarcoderModelProvider):
+    """
+    Starcoder 15B preset: supplies the size-specific fields on top of StarcoderModelProvider.
+    """
+
+    num_layers: int = 40
+    hidden_size: int = 6144
+    ffn_hidden_size: int = 24576  # 4 * hidden_size
+    num_attention_heads: int = 48
+    init_method_std: float = 0.02  # replaces the base provider's 0.01
diff --git a/tests/unit_tests/models/starcoder/__init__.py b/tests/unit_tests/models/starcoder/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.