[TRTLLM-6342][feat] Factory TP sharding of quantized models #8123
New file:

```diff
@@ -0,0 +1,4 @@
+from transformers.models.starcoder2.configuration_starcoder2 import Starcoder2Config
+
+# Remove this patch after TRT-LLM upgrades to the HF transformers version >= 4.57
+Starcoder2Config.base_model_tp_plan["layers.*.mlp.c_proj"] = "rowwise"
```
Comment on lines +3 to +4:

🛠️ Refactor suggestion | 🟠 Major

**Verify transformers version 4.57 availability and add tracking for patch removal.**

The comment references transformers >= 4.57 as the removal threshold, but version 4.56.0 was released in August 2025 and 4.57 has not been released yet. Please verify the actual version that includes the fix and confirm the timeline. Additionally, consider adding a TODO or FIXME comment with an issue reference so this temporary patch is tracked and removed appropriately.

Run the following script to check the latest transformers version and search for related PRs:

```shell
#!/bin/bash
# Check latest transformers version
curl -s https://pypi.org/pypi/transformers/json | jq -r '.info.version'

# Search for Starcoder2 TP plan fixes in the transformers repo
gh api graphql -f query='
{
  search(query: "repo:huggingface/transformers starcoder2 base_model_tp_plan", type: ISSUE, first: 5) {
    nodes {
      ... on Issue {
        number
        title
        state
        createdAt
      }
      ... on PullRequest {
        number
        title
        state
        createdAt
        merged
      }
    }
  }
}'
```

**Update the patch-removal comment with the correct version and a tracking reference.** Replace the existing comment line with:

```python
# TODO: Remove this patch after upgrading to HuggingFace transformers >= 4.56.2 (fix included in HF PR #40814)
```
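The review's concern about forgetting to remove the workaround can also be addressed mechanically by gating the patch on the installed version, so it becomes a no-op once the upgrade lands. A minimal sketch, using a stand-in config class rather than the real `Starcoder2Config` (the class, version tuple, and `apply_patch` helper here are illustrative, not part of the PR):

```python
# Stand-in for transformers' Starcoder2Config: base_model_tp_plan maps a
# parameter-name glob to a tensor-parallel action ("colwise" / "rowwise").
class FakeStarcoder2Config:
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.mlp.c_fc": "colwise",
        # "layers.*.mlp.c_proj" missing: the upstream gap the patch works around
    }


def apply_patch(config_cls, installed_version, fixed_in=(4, 56, 2)):
    """Add the c_proj row-wise entry only if transformers predates the fix."""
    if installed_version < fixed_in:
        config_cls.base_model_tp_plan["layers.*.mlp.c_proj"] = "rowwise"


# On an old transformers (< 4.56.2) the patch applies...
apply_patch(FakeStarcoder2Config, (4, 55, 0))
print(FakeStarcoder2Config.base_model_tp_plan["layers.*.mlp.c_proj"])  # rowwise
```

Once the dependency is bumped past the fix, the same call silently does nothing, so no manual cleanup is strictly required (though the TODO is still useful for tracking).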
```diff
@@ -292,7 +292,7 @@ def detect_sharding_from_factory_config(
     num_simple_shards = 0
     num_row_col_shards = 0

-    for lin_node in filtered_nodes(gm.graph.nodes, is_linear_op):
+    for lin_node in filtered_nodes(gm.graph.nodes, [is_linear_op, is_fake_quantized_linear_op]):

         # use node's weight name to get the module name
         module_name = lin_node.args[1].target
```

Comment on the changed line:

**Duplicate nodes will be processed due to the `filtered_nodes` bug.** Passing a list of predicates to `filtered_nodes` yields a node once per matching predicate, so the same linear node can be processed more than once here. This issue will be resolved once the bug in `filtered_nodes` is fixed (see the review comment on that function below).
```diff
@@ -368,7 +368,7 @@ def detect_sharding_from_factory_config(
                 )
                 num_row_col_shards += 1
             else:
-                ad_logger.warning("Invalid sharding config. Skipping.")
+                ad_logger.warning(f"Unsupported sharding action {config}. Skipping.")
         else:
             # TODO: local refers to hybrid EP+TP parallelism. Not supported yet.
             ad_logger.warning("Local EP+TP sharding is not supported yet. Skipping.")
```
```diff
@@ -387,7 +387,19 @@ def detect_sharding_from_factory_config(
                 )
                 num_simple_shards += 1
             else:
-                ad_logger.warning("Invalid sharding config. Skipping.")
+                ad_logger.warning(
+                    f"Unsupported sharding action {config}. Fallback to simple shard"
+                )
+                sharding_config.tp_transforms.append(
+                    TPShardingInfo.from_node(
+                        lin_node,
+                        split_dim=SplitDimension.COLUMN,
+                        rank=rank,
+                        world_size=world_size,
+                        dist_op="all_gather",
+                        min_local_shape=1,
+                    )
+                )
             # after successful match, break the loop
             break
```
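The fallback above splits the linear layer along the output (column) dimension and recombines the per-rank partial outputs with an `all_gather`. The numerical idea can be sketched with plain Python lists standing in for tensors and collectives (`column_shard` and `matvec` are illustrative helpers, not TRT-LLM APIs):

```python
def column_shard(rows, rank, world_size):
    """Take this rank's contiguous slice of the weight's output dimension."""
    per_rank = len(rows) // world_size
    return rows[rank * per_rank:(rank + 1) * per_rank]


def matvec(w_rows, x):
    """Dense layer: one output element per weight row."""
    return [sum(wi * xi for wi, xi in zip(row, x)) for row in w_rows]


W = [[1, 0], [0, 1], [2, 0], [0, 2]]  # 4 output features, 2 input features
x = [3, 5]
world_size = 2

# Each rank computes outputs for its shard; all_gather concatenates the shards
# in rank order, which is simulated here by the sequential loop + extend.
gathered = []
for rank in range(world_size):
    gathered.extend(matvec(column_shard(W, rank, world_size), x))

assert gathered == matvec(W, x)  # all_gather recovers the full-layer output
print(gathered)  # [3, 5, 6, 10]
```

Because every rank ends up with the full activation after the gather, this scheme is always valid (hence a safe fallback), at the cost of extra communication compared with a matched column/row split pair.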
```diff
@@ -239,6 +239,11 @@ def filtered_nodes(
         for node in nodes:
             if target(node):
                 yield node
+    elif isinstance(target, Iterable) and all(isinstance(t, Callable) for t in target):
+        for node in nodes:
+            for t in target:
+                if t(node):
+                    yield node
     else:
         # Handle the case where target or ops contains operations
         operations = ops if ops is not None else target
```

Comment on lines +242 to +246:

**Add a break after yielding to prevent duplicate nodes.**

The inner loop at lines 244-246 will yield the same node multiple times if more than one predicate matches. This creates duplicates in the iteration results. Apply this diff to add a break statement:

```diff
 elif isinstance(target, Iterable) and all(isinstance(t, Callable) for t in target):
     for node in nodes:
         for t in target:
             if t(node):
                 yield node
+                break
```
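The duplicate-yield behavior and the effect of the suggested `break` are easy to reproduce in isolation. A self-contained sketch with stand-in predicates and integer "nodes" (the helper names are illustrative, mirroring `filtered_nodes` rather than importing it):

```python
def filtered_nodes_buggy(nodes, predicates):
    """Mirrors the new elif branch as written: one yield per matching predicate."""
    for node in nodes:
        for p in predicates:
            if p(node):
                yield node  # a node matching k predicates is yielded k times


def filtered_nodes_fixed(nodes, predicates):
    """With the suggested break: each node is yielded at most once."""
    for node in nodes:
        for p in predicates:
            if p(node):
                yield node
                break  # stop checking further predicates for this node


is_even = lambda n: n % 2 == 0
is_small = lambda n: n < 3

nodes = [0, 1, 2, 3, 4]
print(list(filtered_nodes_buggy(nodes, [is_even, is_small])))  # [0, 0, 1, 2, 2, 4]
print(list(filtered_nodes_fixed(nodes, [is_even, is_small])))  # [0, 1, 2, 4]
```

In the sharding pass this matters because a node that is both a regular and a fake-quantized linear match would otherwise be sharded twice.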
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add the required NVIDIA Apache-2.0 copyright header.
Per coding guidelines, all Python source files must include the NVIDIA Apache-2.0 copyright header with the current year at the top of the file.
Add this header at the top of the file:
As per coding guidelines.
📝 Committable suggestion
🤖 Prompt for AI Agents