Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/megatron/bridge/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@
Qwen25ModelProvider72B,
Qwen25ModelProvider500M,
)
from megatron.bridge.models.starcoder import (
Starcoder2ModelProvider,
Starcoder2ModelProvider3B,
Starcoder2ModelProvider7B,
Starcoder2ModelProvider15B,
StarcoderModelProvider,
StarcoderModelProvider15B,
)
from megatron.bridge.models.t5_provider import T5ModelProvider


Expand Down Expand Up @@ -130,4 +138,10 @@
"Qwen3MoEModelProvider",
"Qwen3MoEModelProvider30B_A3B",
"Qwen3MoEModelProvider235B_A22B",
"Starcoder2ModelProvider",
"Starcoder2ModelProvider3B",
"Starcoder2ModelProvider7B",
"Starcoder2ModelProvider15B",
"StarcoderModelProvider",
"StarcoderModelProvider15B",
]
34 changes: 34 additions & 0 deletions src/megatron/bridge/models/starcoder/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from megatron.bridge.models.starcoder.starcoder2_provider import (
Starcoder2ModelProvider,
Starcoder2ModelProvider3B,
Starcoder2ModelProvider7B,
Starcoder2ModelProvider15B,
)
from megatron.bridge.models.starcoder.starcoder_provider import (
StarcoderModelProvider,
StarcoderModelProvider15B,
)


# Names exported by `from megatron.bridge.models.starcoder import *`; these are
# exactly the provider classes imported above from the two provider modules.
__all__ = [
    "StarcoderModelProvider",
    "StarcoderModelProvider15B",
    "Starcoder2ModelProvider",
    "Starcoder2ModelProvider3B",
    "Starcoder2ModelProvider7B",
    "Starcoder2ModelProvider15B",
]
91 changes: 91 additions & 0 deletions src/megatron/bridge/models/starcoder/starcoder2_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Callable, List, Optional

import torch.nn.functional as F

from megatron.bridge.models.gpt_provider import GPTModelProvider


@dataclass
class Starcoder2ModelProvider(GPTModelProvider):
    """
    Model Provider class for Starcoder2, inheriting from GPTModelProvider.

    This class only declares default values; it adds no methods of its own.
    The defaults below are the ones shared by every Starcoder2 size, while the
    size-dependent fields (layer count, hidden sizes, head counts, rotary base)
    are set by the size-specific subclasses.
    """

    # configs that are common across model sizes
    normalization: str = "LayerNorm"
    activation_func: Callable = F.gelu
    add_bias_linear: bool = True
    seq_length: int = 16384
    position_embedding_type: str = "rope"
    rotary_percent: float = 1.0
    hidden_dropout: float = 0.0
    attention_dropout: float = 0.0
    init_method_std: float = 0.01
    share_embeddings_and_output_weights: bool = False
    # The three fields below default to None, so they are annotated Optional.
    # NOTE(review): how None is interpreted is decided by the base provider /
    # config classes, which are not visible here -- confirm before relying on it.
    kv_channels: Optional[int] = None
    num_query_groups: Optional[int] = None
    window_size: Optional[List[int]] = None
    attention_softmax_in_fp32: bool = True
    bias_activation_fusion: bool = True
    bias_dropout_fusion: bool = True
    layernorm_epsilon: float = 1e-5


@dataclass
class Starcoder2ModelProvider3B(Starcoder2ModelProvider):
    """
    Model Provider for the Starcoder2 3B, inheriting from Starcoder2ModelProvider.

    Only the size-specific fields are set here; every other default comes from
    the parent class.
    """

    num_layers: int = 30
    hidden_size: int = 3072
    # Four times hidden_size.
    ffn_hidden_size: int = 12288
    # Replaces the parent's default of None.
    num_query_groups: int = 2
    num_attention_heads: int = 24
    # Replaces the parent's default of 0.01.
    init_method_std: float = 0.018042
    # NOTE(review): non-round value, presumably copied verbatim from the
    # released checkpoint configuration -- confirm against the upstream config.
    rotary_base: float = 999999.4420358813


@dataclass
class Starcoder2ModelProvider7B(Starcoder2ModelProvider):
    """
    Model Provider for the Starcoder2 7B, inheriting from Starcoder2ModelProvider.

    Only the size-specific fields are set here; every other default comes from
    the parent class.
    """

    num_layers: int = 32
    hidden_size: int = 4608
    # Four times hidden_size.
    ffn_hidden_size: int = 18432
    # Replaces the parent's default of None.
    num_query_groups: int = 4
    num_attention_heads: int = 36
    # Replaces the parent's default of 0.01.
    init_method_std: float = 0.018042
    rotary_base: float = 1_000_000


@dataclass
class Starcoder2ModelProvider15B(Starcoder2ModelProvider):
    """
    Model Provider for the Starcoder2 15B, inheriting from Starcoder2ModelProvider.

    Only the size-specific fields are set here; every other default comes from
    the parent class.
    """

    num_layers: int = 40
    hidden_size: int = 6144
    # Four times hidden_size.
    ffn_hidden_size: int = 24576
    # Replaces the parent's default of None.
    num_query_groups: int = 4
    num_attention_heads: int = 48
    # Replaces the parent's default of 0.01.
    init_method_std: float = 0.01275
    # NOTE(review): this is 1e5, ten times smaller than the 7B provider's 1e6;
    # presumably intentional -- confirm against the upstream checkpoint config.
    rotary_base: float = 100_000
57 changes: 57 additions & 0 deletions src/megatron/bridge/models/starcoder/starcoder_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Callable, Optional

import torch.nn.functional as F

from megatron.bridge.models.gpt_provider import GPTModelProvider


@dataclass
class StarcoderModelProvider(GPTModelProvider):
    """
    Model Provider class for Starcoder, inheriting from GPTModelProvider.

    This class only declares default values; it adds no methods of its own.
    The defaults below are the ones shared by every Starcoder size, while the
    size-dependent fields (layer count, hidden sizes, head count) are set by
    the size-specific subclasses.
    """

    # configs that are common across model sizes
    normalization: str = "LayerNorm"
    activation_func: Callable = F.gelu
    add_bias_linear: bool = True
    seq_length: int = 8192
    position_embedding_type: str = "learned_absolute"
    hidden_dropout: float = 0.2
    attention_dropout: float = 0.2
    init_method_std: float = 0.01
    layernorm_epsilon: float = 1e-5
    share_embeddings_and_output_weights: bool = False
    # Defaults to None, so the annotation is Optional.
    # NOTE(review): how None is interpreted is decided by the base provider /
    # config classes, which are not visible here -- confirm before relying on it.
    kv_channels: Optional[int] = None
    num_query_groups: int = 1
    attention_softmax_in_fp32: bool = True
    bias_activation_fusion: bool = True
    bias_dropout_fusion: bool = True


@dataclass
class StarcoderModelProvider15B(StarcoderModelProvider):
    """
    Model Provider for the Starcoder 15B, inheriting from StarcoderModelProvider.

    Only the size-specific fields are set here; every other default, including
    num_query_groups = 1, comes from the parent class.
    """

    num_layers: int = 40
    hidden_size: int = 6144
    # Four times hidden_size.
    ffn_hidden_size: int = 24576
    num_attention_heads: int = 48
    # Replaces the parent's default of 0.01.
    init_method_std: float = 0.02
13 changes: 13 additions & 0 deletions tests/unit_tests/models/starcoder/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Loading
Loading