diff --git a/src/megatron/bridge/models/__init__.py b/src/megatron/bridge/models/__init__.py index 4338bbbc4e..eb2925ebb0 100644 --- a/src/megatron/bridge/models/__init__.py +++ b/src/megatron/bridge/models/__init__.py @@ -72,6 +72,14 @@ Qwen25ModelProvider72B, Qwen25ModelProvider500M, ) +from megatron.bridge.models.starcoder import ( + Starcoder2ModelProvider, + Starcoder2ModelProvider3B, + Starcoder2ModelProvider7B, + Starcoder2ModelProvider15B, + StarcoderModelProvider, + StarcoderModelProvider15B, +) from megatron.bridge.models.t5_provider import T5ModelProvider @@ -130,4 +138,10 @@ "Qwen3MoEModelProvider", "Qwen3MoEModelProvider30B_A3B", "Qwen3MoEModelProvider235B_A22B", + "Starcoder2ModelProvider", + "Starcoder2ModelProvider3B", + "Starcoder2ModelProvider7B", + "Starcoder2ModelProvider15B", + "StarcoderModelProvider", + "StarcoderModelProvider15B", ] diff --git a/src/megatron/bridge/models/starcoder/__init__.py b/src/megatron/bridge/models/starcoder/__init__.py new file mode 100644 index 0000000000..145f753be6 --- /dev/null +++ b/src/megatron/bridge/models/starcoder/__init__.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from megatron.bridge.models.starcoder.starcoder2_provider import ( + Starcoder2ModelProvider, + Starcoder2ModelProvider3B, + Starcoder2ModelProvider7B, + Starcoder2ModelProvider15B, +) +from megatron.bridge.models.starcoder.starcoder_provider import ( + StarcoderModelProvider, + StarcoderModelProvider15B, +) + + +__all__ = [ + "StarcoderModelProvider", + "StarcoderModelProvider15B", + "Starcoder2ModelProvider", + "Starcoder2ModelProvider3B", + "Starcoder2ModelProvider7B", + "Starcoder2ModelProvider15B", +] diff --git a/src/megatron/bridge/models/starcoder/starcoder2_provider.py b/src/megatron/bridge/models/starcoder/starcoder2_provider.py new file mode 100644 index 0000000000..d04df26623 --- /dev/null +++ b/src/megatron/bridge/models/starcoder/starcoder2_provider.py @@ -0,0 +1,91 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Callable, List, Optional + +import torch.nn.functional as F + +from megatron.bridge.models.gpt_provider import GPTModelProvider + + +@dataclass +class Starcoder2ModelProvider(GPTModelProvider): + """ + Model Provider class for Starcoder2, inheriting from GPTModelProvider. 
+ """ + + # configs that are common across model sizes + normalization: str = "LayerNorm" + activation_func: Callable = F.gelu + add_bias_linear: bool = True + seq_length: int = 16384 + position_embedding_type: str = "rope" + rotary_percent: float = 1.0 + hidden_dropout: float = 0.0 + attention_dropout: float = 0.0 + init_method_std: float = 0.01 + share_embeddings_and_output_weights: bool = False + kv_channels: int = None + num_query_groups: int = None + window_size: Optional[List[int]] = None + attention_softmax_in_fp32: bool = True + bias_activation_fusion: bool = True + bias_dropout_fusion: bool = True + layernorm_epsilon: float = 1e-5 + + +@dataclass +class Starcoder2ModelProvider3B(Starcoder2ModelProvider): + """ + Model Provider for the Starcoder2 3B, inheriting from Starcoder2ModelProvider. + """ + + num_layers: int = 30 + hidden_size: int = 3072 + ffn_hidden_size: int = 12288 + num_query_groups: int = 2 + num_attention_heads: int = 24 + init_method_std: float = 0.018042 + rotary_base: float = 999999.4420358813 + + +@dataclass +class Starcoder2ModelProvider7B(Starcoder2ModelProvider): + """ + Model Provider for the Starcoder2 7B, inheriting from Starcoder2ModelProvider. + """ + + num_layers: int = 32 + hidden_size: int = 4608 + ffn_hidden_size: int = 18432 + num_query_groups: int = 4 + num_attention_heads: int = 36 + init_method_std: float = 0.018042 + rotary_base: float = 1_000_000 + + +@dataclass +class Starcoder2ModelProvider15B(Starcoder2ModelProvider): + """ + Model Provider for the Starcoder2 15B, inheriting from Starcoder2ModelProvider. 
+ """ + + num_layers: int = 40 + hidden_size: int = 6144 + ffn_hidden_size: int = 24576 + num_query_groups: int = 4 + num_attention_heads: int = 48 + init_method_std: float = 0.01275 + rotary_base: float = 100_000 diff --git a/src/megatron/bridge/models/starcoder/starcoder_provider.py b/src/megatron/bridge/models/starcoder/starcoder_provider.py new file mode 100644 index 0000000000..064c56d282 --- /dev/null +++ b/src/megatron/bridge/models/starcoder/starcoder_provider.py @@ -0,0 +1,57 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Callable + +import torch.nn.functional as F + +from megatron.bridge.models.gpt_provider import GPTModelProvider + + +@dataclass +class StarcoderModelProvider(GPTModelProvider): + """ + Model Provider class for Starcoder, inheriting from GPTModelProvider. 
+ """ + + # configs that are common across model sizes + normalization: str = "LayerNorm" + activation_func: Callable = F.gelu + add_bias_linear: bool = True + seq_length: int = 8192 + position_embedding_type: str = "learned_absolute" + hidden_dropout: float = 0.2 + attention_dropout: float = 0.2 + init_method_std: float = 0.01 + layernorm_epsilon: float = 1e-5 + share_embeddings_and_output_weights: bool = False + kv_channels: int = None + num_query_groups: int = 1 + attention_softmax_in_fp32: bool = True + bias_activation_fusion: bool = True + bias_dropout_fusion: bool = True + + +@dataclass +class StarcoderModelProvider15B(StarcoderModelProvider): + """ + Model Provider for the Starcoder 15B, inheriting from StarcoderModelProvider. + """ + + num_layers: int = 40 + hidden_size: int = 6144 + ffn_hidden_size: int = 24576 + num_attention_heads: int = 48 + init_method_std: float = 0.02 diff --git a/tests/unit_tests/models/starcoder/__init__.py b/tests/unit_tests/models/starcoder/__init__.py new file mode 100644 index 0000000000..341a77c5bc --- /dev/null +++ b/tests/unit_tests/models/starcoder/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/unit_tests/models/starcoder/test_starcoder2_provider.py b/tests/unit_tests/models/starcoder/test_starcoder2_provider.py new file mode 100644 index 0000000000..1817f34198 --- /dev/null +++ b/tests/unit_tests/models/starcoder/test_starcoder2_provider.py @@ -0,0 +1,192 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch.nn.functional as F + +from megatron.bridge.models.starcoder.starcoder2_provider import ( + Starcoder2ModelProvider, + Starcoder2ModelProvider3B, + Starcoder2ModelProvider7B, + Starcoder2ModelProvider15B, +) + + +class TestStarcoder2ModelProvider: + """Test cases for Starcoder2ModelProvider class.""" + + def test_starcoder2_model_provider_defaults(self): + """Test Starcoder2ModelProvider has correct default values.""" + provider = Starcoder2ModelProvider( + num_layers=12, + hidden_size=768, + num_attention_heads=12, + ) + + # Check required transformer config fields + assert provider.num_layers == 12 + assert provider.hidden_size == 768 + assert provider.num_attention_heads == 12 + + # Check Starcoder2-specific defaults + transformer config post init + assert provider.normalization == "LayerNorm" + assert provider.activation_func == F.gelu + assert provider.add_bias_linear is True + assert provider.seq_length == 16384 + assert provider.position_embedding_type == "rope" + assert provider.rotary_percent == 1.0 + assert provider.hidden_dropout == 0.0 + assert 
provider.attention_dropout == 0.0 + assert provider.init_method_std == 0.01 + assert provider.share_embeddings_and_output_weights is False + assert provider.kv_channels == 64 + assert provider.num_query_groups == 12 + assert provider.window_size is None + assert provider.attention_softmax_in_fp32 is True + assert provider.bias_activation_fusion is True + assert provider.bias_dropout_fusion is True + assert provider.layernorm_epsilon == 1e-5 + + +class TestStarcoder2ModelProvider3B: + """Test cases for Starcoder2ModelProvider3B class.""" + + def test_starcoder2_3b_defaults(self): + """Test Starcoder2ModelProvider3B has correct default values for 3B model.""" + provider = Starcoder2ModelProvider3B() + + # Check 3B-specific configuration + assert provider.num_layers == 30 + assert provider.hidden_size == 3072 + assert provider.ffn_hidden_size == 12288 + assert provider.num_query_groups == 2 + assert provider.num_attention_heads == 24 + assert provider.init_method_std == 0.018042 + assert provider.rotary_base == 999999.4420358813 + + # Check inherited Starcoder2 defaults + assert provider.normalization == "LayerNorm" + assert provider.activation_func == F.gelu + assert provider.add_bias_linear is True + assert provider.seq_length == 16384 + assert provider.position_embedding_type == "rope" + assert provider.rotary_percent == 1.0 + assert provider.hidden_dropout == 0.0 + assert provider.attention_dropout == 0.0 + assert provider.share_embeddings_and_output_weights is False + assert provider.window_size is None + assert provider.attention_softmax_in_fp32 is True + assert provider.bias_activation_fusion is True + assert provider.bias_dropout_fusion is True + assert provider.layernorm_epsilon == 1e-5 + + +class TestStarcoder2ModelProvider7B: + """Test cases for Starcoder2ModelProvider7B class.""" + + def test_starcoder2_7b_defaults(self): + """Test Starcoder2ModelProvider7B has correct default values for 7B model.""" + provider = Starcoder2ModelProvider7B() + + # Check 
7B-specific configuration + assert provider.num_layers == 32 + assert provider.hidden_size == 4608 + assert provider.ffn_hidden_size == 18432 + assert provider.num_query_groups == 4 + assert provider.num_attention_heads == 36 + assert provider.init_method_std == 0.018042 + assert provider.rotary_base == 1_000_000 + + # Check inherited Starcoder2 defaults + assert provider.normalization == "LayerNorm" + assert provider.activation_func == F.gelu + assert provider.add_bias_linear is True + assert provider.seq_length == 16384 + assert provider.position_embedding_type == "rope" + assert provider.rotary_percent == 1.0 + assert provider.hidden_dropout == 0.0 + assert provider.attention_dropout == 0.0 + assert provider.share_embeddings_and_output_weights is False + assert provider.kv_channels == 128 + assert provider.window_size is None + assert provider.attention_softmax_in_fp32 is True + assert provider.bias_activation_fusion is True + assert provider.bias_dropout_fusion is True + assert provider.layernorm_epsilon == 1e-5 + + +class TestStarcoder2ModelProvider15B: + """Test cases for Starcoder2ModelProvider15B class.""" + + def test_starcoder2_15b_defaults(self): + """Test Starcoder2ModelProvider15B has correct default values for 15B model.""" + provider = Starcoder2ModelProvider15B() + + # Check 15B-specific configuration + assert provider.num_layers == 40 + assert provider.hidden_size == 6144 + assert provider.ffn_hidden_size == 24576 + assert provider.num_query_groups == 4 + assert provider.num_attention_heads == 48 + assert provider.init_method_std == 0.01275 + assert provider.rotary_base == 100_000 + + # Check inherited Starcoder2 defaults + assert provider.normalization == "LayerNorm" + assert provider.activation_func == F.gelu + assert provider.add_bias_linear is True + assert provider.seq_length == 16384 + assert provider.position_embedding_type == "rope" + assert provider.rotary_percent == 1.0 + assert provider.hidden_dropout == 0.0 + assert 
provider.attention_dropout == 0.0 + assert provider.share_embeddings_and_output_weights is False + assert provider.kv_channels == 128 + assert provider.window_size is None + assert provider.attention_softmax_in_fp32 is True + assert provider.bias_activation_fusion is True + assert provider.bias_dropout_fusion is True + assert provider.layernorm_epsilon == 1e-5 + + +class TestStarcoder2ProviderInheritance: + """Test inheritance relationships between Starcoder2 providers.""" + + def test_starcoder2_models_inherit_from_base(self): + """Test Starcoder2 providers inherit from Starcoder2ModelProvider.""" + assert issubclass(Starcoder2ModelProvider3B, Starcoder2ModelProvider) + assert issubclass(Starcoder2ModelProvider7B, Starcoder2ModelProvider) + assert issubclass(Starcoder2ModelProvider15B, Starcoder2ModelProvider) + + def test_starcoder2_models_inherit_from_gpt(self): + """Test Starcoder2 providers inherit from GPTModelProvider.""" + from megatron.bridge.models.gpt_provider import GPTModelProvider + + assert issubclass(Starcoder2ModelProvider, GPTModelProvider) + assert issubclass(Starcoder2ModelProvider3B, GPTModelProvider) + assert issubclass(Starcoder2ModelProvider7B, GPTModelProvider) + assert issubclass(Starcoder2ModelProvider15B, GPTModelProvider) + + def test_provide_method_inherited(self): + """Test that provide method works correctly in inherited classes.""" + # Test with all Starcoder2 providers + providers = [ + Starcoder2ModelProvider3B(), + Starcoder2ModelProvider7B(), + Starcoder2ModelProvider15B(), + ] + + for provider in providers: + # The provide method should be inherited from GPTModelProvider + assert hasattr(provider, "provide") + assert callable(provider.provide) diff --git a/tests/unit_tests/models/starcoder/test_starcoder_provider.py b/tests/unit_tests/models/starcoder/test_starcoder_provider.py new file mode 100644 index 0000000000..ec30ef33b8 --- /dev/null +++ b/tests/unit_tests/models/starcoder/test_starcoder_provider.py @@ -0,0 +1,103 @@ +# 
Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch.nn.functional as F + +from megatron.bridge.models.starcoder.starcoder_provider import ( + StarcoderModelProvider, + StarcoderModelProvider15B, +) + + +class TestStarcoderModelProvider: + """Test cases for StarcoderModelProvider class.""" + + def test_starcoder_model_provider_defaults(self): + """Test StarcoderModelProvider has correct default values.""" + provider = StarcoderModelProvider( + num_layers=12, + hidden_size=768, + num_attention_heads=12, + ) + + # Check required transformer config fields + assert provider.num_layers == 12 + assert provider.hidden_size == 768 + assert provider.num_attention_heads == 12 + + # Check Starcoder-specific defaults + assert provider.normalization == "LayerNorm" + assert provider.activation_func == F.gelu + assert provider.add_bias_linear is True + assert provider.seq_length == 8192 + assert provider.position_embedding_type == "learned_absolute" + assert provider.hidden_dropout == 0.2 + assert provider.attention_dropout == 0.2 + assert provider.init_method_std == 0.01 + assert provider.layernorm_epsilon == 1e-5 + assert provider.share_embeddings_and_output_weights is False + assert provider.kv_channels == 64 + assert provider.num_query_groups == 1 + assert provider.attention_softmax_in_fp32 is True + assert provider.bias_activation_fusion is True + assert provider.bias_dropout_fusion is True + + +class 
TestStarcoderModelProvider15B: + """Test cases for StarcoderModelProvider15B class.""" + + def test_starcoder_model_provider_15b_defaults(self): + """Test StarcoderModelProvider15B has correct default values for 15B model.""" + provider = StarcoderModelProvider15B() + + # Check 15B-specific configuration + assert provider.num_layers == 40 + assert provider.hidden_size == 6144 + assert provider.ffn_hidden_size == 24576 + assert provider.num_attention_heads == 48 + assert provider.init_method_std == 0.02 + + # Check inherited Starcoder defaults + assert provider.normalization == "LayerNorm" + assert provider.activation_func == F.gelu + assert provider.add_bias_linear is True + assert provider.seq_length == 8192 + assert provider.position_embedding_type == "learned_absolute" + assert provider.hidden_dropout == 0.2 + assert provider.attention_dropout == 0.2 + assert provider.layernorm_epsilon == 1e-5 + assert provider.share_embeddings_and_output_weights is False + assert provider.kv_channels == 128 + assert provider.num_query_groups == 1 + assert provider.attention_softmax_in_fp32 is True + assert provider.bias_activation_fusion is True + assert provider.bias_dropout_fusion is True + + +class TestStarcoderProviderInheritance: + """Test inheritance relationships between Starcoder providers.""" + + def test_starcoder_models_inherit_from_gpt(self): + """Test Starcoder providers inherit from GPTModelProvider.""" + from megatron.bridge.models.gpt_provider import GPTModelProvider + + assert issubclass(StarcoderModelProvider, GPTModelProvider) + assert issubclass(StarcoderModelProvider15B, StarcoderModelProvider) + assert issubclass(StarcoderModelProvider15B, GPTModelProvider) + + def test_provide_method_inherited(self): + """Test that provide method works correctly in inherited classes.""" + provider = StarcoderModelProvider15B() + assert hasattr(provider, "provide") + assert callable(provider.provide)