626 changes: 298 additions & 328 deletions src/transformers/modeling_rope_utils.py

Large diffs are not rendered by default.
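
Reviewer note: the bulk of this refactor sits in modeling_rope_utils.py, whose diff is collapsed above. Judging only from the call sites in the model configs below, the new RotaryEmbeddingConfigMixin replaces the free functions standardize_rope_params and rope_config_validation with a single convert_rope_params_to_dict step that normalizes legacy rope_theta / rope_scaling kwargs into self.rope_parameters and returns the remaining kwargs. The sketch below reconstructs that behaviour under those assumptions; the real signature, key names, and validation logic in the PR may differ.

from typing import Any, Optional


class RotaryEmbeddingConfigMixinSketch:
    """Stand-in for the real mixin; reconstructs only the behaviour visible at the call sites."""

    rope_parameters: Optional[dict[str, Any]] = None

    def convert_rope_params_to_dict(self, default_theta: float, **kwargs) -> dict[str, Any]:
        # Legacy configs carried `rope_scaling` (a dict) and `rope_theta` (a float) as
        # separate fields; newer configs pass a single `rope_parameters` dict.
        legacy_scaling = kwargs.pop("rope_scaling", None)
        rope_theta = kwargs.pop("rope_theta", default_theta)

        params = dict(self.rope_parameters or legacy_scaling or {})
        params.setdefault("rope_type", "default")
        params.setdefault("rope_theta", rope_theta)
        self.rope_parameters = params

        # Whatever is left should flow on to PreTrainedConfig.__init__, which is why the
        # call sites below reassign `kwargs` from the return value.
        return kwargs


cfg = RotaryEmbeddingConfigMixinSketch()
remaining = cfg.convert_rope_params_to_dict(default_theta=10_000.0, rope_theta=500_000.0, use_cache=True)
print(cfg.rope_parameters)  # {'rope_type': 'default', 'rope_theta': 500000.0}
print(remaining)            # {'use_cache': True}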

14 changes: 4 additions & 10 deletions src/transformers/models/apertus/configuration_apertus.py
@@ -22,10 +22,10 @@
 from typing import Optional
 
 from ...configuration_utils import PreTrainedConfig
-from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
+from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin
 
 
-class ApertusConfig(PreTrainedConfig):
+class ApertusConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin):
     r"""
     This is the configuration class to store the configuration of a [`ApertusModel`]. It is used to instantiate a Apertus
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
@@ -160,15 +160,9 @@ def __init__(
         self.use_cache = use_cache
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
-        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
-
-        # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 12000000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
+        self.rope_parameters = rope_parameters
 
+        kwargs = self.convert_rope_params_to_dict(default_theta=12000000.0, **kwargs)
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
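
For orientation, here is a hedged guess at how the same RoPE settings serialize before and after this change, assuming rope_parameters simply merges the old rope_theta scalar into the old rope_scaling dict; the key names are illustrative, not verified against the PR.

# Hedged illustration only: possible before/after layout of RoPE settings in a config dict.
legacy_style = {
    "rope_theta": 12000000.0,
    "rope_scaling": {"rope_type": "linear", "factor": 2.0},
}

new_style = {
    "rope_parameters": {
        "rope_type": "linear",
        "factor": 2.0,
        "rope_theta": 12000000.0,
    },
}

# A naive merge of the legacy fields reproduces the assumed new layout.
merged = {**legacy_style["rope_scaling"], "rope_theta": legacy_style["rope_theta"]}
assert merged == new_style["rope_parameters"]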
55 changes: 30 additions & 25 deletions src/transformers/models/apertus/modular_apertus.py
@@ -20,11 +20,11 @@
 from torch import nn
 
 from ...cache_utils import Cache
-from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
+from ...configuration_utils import PreTrainedConfig
+from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, logging
-from ..llama.configuration_llama import LlamaConfig
 from ..llama.modeling_llama import (
     LlamaAttention,
     LlamaDecoderLayer,
@@ -43,7 +43,7 @@
 logger = logging.get_logger(__name__)
 
 
-class ApertusConfig(LlamaConfig):
+class ApertusConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin):
     r"""
     This is the configuration class to store the configuration of a [`ApertusModel`]. It is used to instantiate a Apertus
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
@@ -116,6 +116,7 @@ class ApertusConfig(LlamaConfig):
     ```"""
 
     model_type = "apertus"
+    keys_to_ignore_at_inference = ["past_key_values"]
     base_model_tp_plan = {
         "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
         "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
@@ -124,6 +125,11 @@
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
     }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
 
     def __init__(
         self,
@@ -154,35 +160,34 @@ def __init__(
         attention_dropout: Optional[float] = 0.0,
         **kwargs,
     ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.rope_parameters = rope_parameters
+
+        kwargs = self.convert_rope_params_to_dict(default_theta=12000000.0, **kwargs)
         super().__init__(
-            vocab_size=vocab_size,
-            hidden_size=hidden_size,
-            intermediate_size=intermediate_size,
-            num_hidden_layers=num_hidden_layers,
-            num_attention_heads=num_attention_heads,
-            num_key_value_heads=num_key_value_heads,
-            hidden_act=hidden_act,
-            max_position_embeddings=max_position_embeddings,
-            initializer_range=initializer_range,
-            rms_norm_eps=rms_norm_eps,
-            use_cache=use_cache,
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
-            rope_parameters=rope_parameters,
-            attention_bias=attention_bias,
-            attention_dropout=attention_dropout,
             **kwargs,
        )
-        del self.pretraining_tp
-        del self.mlp_bias
-        del self.head_dim
-
-        # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 12000000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
 
 
 class ApertusMLP(NemotronMLP):
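
Because the modular ApertusConfig no longer subclasses LlamaConfig, the fields it used to forward to LlamaConfig.__init__ (and then partially undo with `del self.pretraining_tp` and friends) are now assigned directly before a plain PreTrainedConfig-style super call. A simplified, hypothetical sketch of that pattern, with invented class names and trimmed-down defaults:

class BaseConfigSketch:
    """Plays the role of PreTrainedConfig: only generic, model-agnostic settings."""

    def __init__(self, pad_token_id=None, bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, **kwargs):
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.tie_word_embeddings = tie_word_embeddings


class ApertusConfigSketch(BaseConfigSketch):
    """Plays the role of the new ApertusConfig: model-specific fields live on the subclass."""

    def __init__(
        self,
        hidden_size=4096,
        num_attention_heads=32,
        num_key_value_heads=None,
        rope_parameters=None,
        **kwargs,
    ):
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        # for backward compatibility, mirror the diff's handling of missing KV heads
        self.num_key_value_heads = num_key_value_heads if num_key_value_heads is not None else num_attention_heads
        self.rope_parameters = rope_parameters or {"rope_type": "default", "rope_theta": 12000000.0}
        super().__init__(**kwargs)


cfg = ApertusConfigSketch(num_attention_heads=32)
print(cfg.num_key_value_heads)            # 32
print(cfg.rope_parameters["rope_theta"])  # 12000000.0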
14 changes: 4 additions & 10 deletions src/transformers/models/arcee/configuration_arcee.py
@@ -22,10 +22,10 @@
 from typing import Optional
 
 from ...configuration_utils import PreTrainedConfig
-from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
+from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin
 
 
-class ArceeConfig(PreTrainedConfig):
+class ArceeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin):
     r"""
     This is the configuration class to store the configuration of a [`ArceeModel`]. It is used to instantiate an Arcee
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
@@ -163,14 +163,8 @@ def __init__(
         self.attention_dropout = attention_dropout
         self.mlp_bias = mlp_bias
         self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
-        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
-
-        # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 10000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
+        self.rope_parameters = rope_parameters
+        kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs)
 
         super().__init__(
             pad_token_id=pad_token_id,
14 changes: 4 additions & 10 deletions src/transformers/models/aria/configuration_aria.py
@@ -21,11 +21,11 @@
 from typing import Optional
 
 from ...configuration_utils import PreTrainedConfig
-from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
+from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin
 from ..auto import CONFIG_MAPPING, AutoConfig
 
 
-class AriaTextConfig(PreTrainedConfig):
+class AriaTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin):
     r"""
     This class handles the configuration for the text component of the Aria model.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria
@@ -168,14 +168,8 @@ def __init__(
         self.attention_dropout = attention_dropout
         self.mlp_bias = mlp_bias
         self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
-        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
-
-        # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 10000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
+        self.rope_parameters = rope_parameters
+        kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs)
 
         super().__init__(
             pad_token_id=pad_token_id,
18 changes: 6 additions & 12 deletions src/transformers/models/bamba/configuration_bamba.py
@@ -17,14 +17,14 @@
 from typing import Optional
 
 from ...configuration_utils import PreTrainedConfig
-from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
+from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin
 from ...utils import logging
 
 
 logger = logging.get_logger(__name__)
 
 
-class BambaConfig(PreTrainedConfig):
+class BambaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin):
     r"""
     This is the configuration class to store the configuration of a [`BambaModel`]. It is used to instantiate a
     BambaModel model according to the specified arguments, defining the model architecture. Instantiating a configuration
@@ -171,16 +171,6 @@ def __init__(
         self.num_logits_to_keep = num_logits_to_keep
 
         self.attn_layer_indices = attn_layer_indices
-        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        self.partial_rotary_factor = 0.5
-        rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
-
-        # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 10000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
-
         mamba_intermediate = mamba_expand * hidden_size
 
         if mamba_intermediate % mamba_n_heads != 0:
@@ -203,6 +193,10 @@
         self.mamba_conv_bias = mamba_conv_bias
         self.mamba_proj_bias = mamba_proj_bias
         self.z_loss_coefficient = z_loss_coefficient
+        self.rope_parameters = rope_parameters
+
+        kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs)
+        self.rope_parameters["partial_rotary_factor"] = 0.5
 
         super().__init__(
             pad_token_id=pad_token_id,
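
Bamba applies rotary embeddings to only part of each head dimension; after this change the partial_rotary_factor is stored inside the rope_parameters dict rather than as a standalone config attribute. A small, assumption-laden illustration of the resulting dict (key names other than partial_rotary_factor are guesses):

# Hypothetical shape of BambaConfig.rope_parameters after the lines above, assuming
# convert_rope_params_to_dict produces a plain dict with `rope_type` and `rope_theta` keys.
rope_parameters = {
    "rope_type": "default",
    "rope_theta": 10_000.0,
}
# The partial factor now lives inside the same dict instead of on the config object.
rope_parameters["partial_rotary_factor"] = 0.5
print(rope_parameters)
# {'rope_type': 'default', 'rope_theta': 10000.0, 'partial_rotary_factor': 0.5}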
14 changes: 4 additions & 10 deletions src/transformers/models/bitnet/configuration_bitnet.py
@@ -16,14 +16,14 @@
 from typing import Optional
 
 from ...configuration_utils import PreTrainedConfig
-from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
+from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin
 from ...utils import logging
 
 
 logger = logging.get_logger(__name__)
 
 
-class BitNetConfig(PreTrainedConfig):
+class BitNetConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin):
     r"""
     This is the configuration class to store the configuration of a [`BitNetModel`]. It is used to instantiate an BitNet
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
@@ -138,14 +138,8 @@ def __init__(
         self.use_cache = use_cache
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
-        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
-
-        # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 500000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
+        self.rope_parameters = rope_parameters
+        kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs)
 
         super().__init__(
             pad_token_id=pad_token_id,