Commit 2b31193

TroyGarden authored and facebook-github-bot committed
add pipeline benchmark for kvzch
Summary:

# context
* modify the kvzch benchmark configs to better represent the real use case
* add config pass-in to the test models
* fix small bugs and minor refactoring

# changes
* the previous kv-zch embedding table was too small, so the prefetch process was too short; after this change (increased table size) the prefetch process is longer

{F1983784711} {F1983784733}

# benchmark

| short name | GPU Runtime (P90) | CPU Runtime (P90) | GPU Peak Mem alloc (P90) | GPU Peak Mem reserved (P90) | GPU Mem used (P90) | Malloc retries (P50/P90/P100) | CPU Peak RSS (P90) |
|--|--|--|--|--|--|--|--|
| regular-base | 9864.51 ms | 9403.68 ms | 33.77 GB | 49.66 GB | 50.71 GB | 0.0 / 0.0 / 0.0 | 30.65 GB |
| kvzch-base | 18804.26 ms | 44245.82 ms | 25.28 GB | 36.33 GB | 37.38 GB | 0.0 / 0.0 / 0.0 | 31.18 GB |
| base-inplace | 20141.71 ms | 46805.58 ms | 25.28 GB | 34.39 GB | 35.44 GB | 0.0 / 0.0 / 0.0 | 31.19 GB |
| kvzch-sdd | 20382.59 ms | 45647.02 ms | 33.42 GB | 47.52 GB | 48.56 GB | 0.0 / 0.0 / 0.0 | 31.13 GB |
| kvzch-prefetch | 17951.19 ms | 38598.57 ms | 33.45 GB | 47.16 GB | 48.21 GB | 0.0 / 0.0 / 0.0 | 30.83 GB |

Differential Revision: D84268361
1 parent 7b3effd · commit 2b31193

File tree: 4 files changed, +34 −15 lines


torchrec/distributed/benchmark/base.py

Lines changed: 0 additions & 9 deletions
```diff
@@ -504,14 +504,6 @@ def wrapper() -> Any:  # pyre-ignore [3]
             help="JSON config file for benchmarking",
         )
 
-        # Add loglevel argument with current logger level as default
-        parser.add_argument(
-            "--loglevel",
-            type=str,
-            default=logging._levelToName[logger.level],
-            help="Set the logging level (e.g. info, debug, warning, error)",
-        )
-
         pre_args, _ = parser.parse_known_args()
 
         yaml_defaults: Dict[str, Any] = (
@@ -531,7 +523,6 @@ def wrapper() -> Any:  # pyre-ignore [3]
         seen_args = {
             "json_config",
             "yaml_config",
-            "loglevel",
         }
 
         for _name, param in sig.parameters.items():
```
torchrec/distributed/benchmark/yaml/prefetch_kvzch.yml

Lines changed: 19 additions & 4 deletions
```diff
@@ -10,18 +10,33 @@ RunOptions:
   sharding_type: table_wise
   profile_dir: "."
   name: "sparsenn_prefetch_kvzch_dram"
+  memory_snapshot: True
+  loglevel: "info"
+  num_float_features: 1000
 PipelineConfig:
   pipeline: "prefetch"
+  # inplace_copy_batch_to_gpu: True
 ModelInputConfig:
-  feature_pooling_avg: 30
+  num_float_features: 1000
+  feature_pooling_avg: 60
+ModelSelectionConfig:
+  model_name: "test_sparse_nn"
+  model_config:
+    num_float_features: 1000
+    submodule_kwargs:
+      dense_arch_out_size: 1024
+      over_arch_out_size: 4096
+      over_arch_hidden_layers: 10
+      dense_arch_hidden_sizes: [128, 128, 128]
+
 EmbeddingTablesConfig:
-  num_unweighted_features: 10
-  num_weighted_features: 10
+  num_unweighted_features: 50
+  num_weighted_features: 50
   embedding_feature_dim: 256
   additional_tables:
     - - name: FP16_table
         embedding_dim: 512
-        num_embeddings: 100_000 # Both feature hashsize and virtual table size
+        num_embeddings: 1_000_000 # Both feature hashsize and virtual table size
         feature_names: ["additional_0_0"]
         data_type: FP16
         total_num_buckets: 100 # num_embedding should be divisible by total_num_buckets
```
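The trailing comment on `total_num_buckets` encodes a hard constraint: the virtual table size must split evenly into buckets. A tiny sketch of that check with the values from this config (the helper name is made up for illustration, not a torchrec API):

```python
# Minimal sketch of the constraint noted in the YAML comment: num_embeddings
# must be divisible by total_num_buckets so every bucket holds the same number
# of virtual-table rows.
def rows_per_bucket(num_embeddings: int, total_num_buckets: int) -> int:
    if num_embeddings % total_num_buckets != 0:
        raise ValueError(
            f"num_embeddings={num_embeddings} is not divisible by "
            f"total_num_buckets={total_num_buckets}"
        )
    return num_embeddings // total_num_buckets


# With the updated config: 1_000_000 rows across 100 buckets -> 10_000 rows each.
assert rows_per_bucket(1_000_000, 100) == 10_000
```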

torchrec/distributed/test_utils/model_config.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -88,6 +88,7 @@ class TestSparseNNConfig(BaseModelConfig):
     over_arch_clazz: Type[nn.Module] = TestOverArchLarge
     postproc_module: Optional[nn.Module] = None
     zch: bool = False
+    submodule_kwargs: Optional[Dict[str, Any]] = None
 
     def generate_model(
         self,
@@ -108,6 +109,7 @@ def generate_model(
             postproc_module=self.postproc_module,
             embedding_groups=self.embedding_groups,
             zch=self.zch,
+            submodule_kwargs=self.submodule_kwargs,
         )
 
 
```
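For reference, with the YAML above, the dict carried by the new `submodule_kwargs` field would look roughly like the following (shown purely for illustration):

```python
# Hypothetical illustration: the submodule_kwargs value produced by the
# ModelSelectionConfig.model_config section of the YAML above, which
# TestSparseNNConfig now forwards into generate_model().
submodule_kwargs = {
    "dense_arch_out_size": 1024,
    "over_arch_out_size": 4096,
    "over_arch_hidden_layers": 10,
    "dense_arch_hidden_sizes": [128, 128, 128],
}
```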

torchrec/distributed/test_utils/test_model.py

Lines changed: 13 additions & 2 deletions
```diff
@@ -881,6 +881,7 @@ def __init__(
         device: Optional[torch.device] = None,
         dense_arch_out_size: Optional[int] = None,
         dense_arch_hidden_sizes: Optional[List[int]] = None,
+        **_kwargs: Any,
     ) -> None:
         """
         Args:
@@ -1191,6 +1192,8 @@ def __init__(
         dense_arch_out_size: Optional[int] = None,
         over_arch_out_size: Optional[int] = None,
         over_arch_hidden_layers: Optional[int] = None,
+        over_arch_hidden_repeat: Optional[int] = None,
+        **_kwargs: Any,
     ) -> None:
         """
         Args:
@@ -1237,7 +1240,8 @@ def __init__(
             ),
             SwishLayerNorm([out_features]),
         ]
-
+        for _ in range(over_arch_hidden_repeat or 0):
+            layers += layers[1:]
         self.overarch = torch.nn.Sequential(*layers)
 
         self.regroup_module = KTRegroupAsDict(
@@ -1398,6 +1402,7 @@ def __init__(
         weighted_tables: List[EmbeddingBagConfig],
         device: Optional[torch.device] = None,
         max_feature_lengths: Optional[Dict[str, int]] = None,
+        **_kwargs: Any,
     ) -> None:
         """
         Args:
@@ -1547,6 +1552,7 @@ def __init__(
         over_arch_clazz: Optional[Type[nn.Module]] = None,
         postproc_module: Optional[nn.Module] = None,
         zch: bool = False,
+        submodule_kwargs: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__(
             tables=cast(List[BaseEmbeddingConfig], tables),
@@ -1559,7 +1565,9 @@ def __init__(
             over_arch_clazz = TestOverArch
         if weighted_tables is None:
             weighted_tables = []
-        self.dense = TestDenseArch(num_float_features, device=dense_device)
+        self.dense = TestDenseArch(
+            num_float_features, device=dense_device, **(submodule_kwargs or {})
+        )
         if zch:
             self.sparse: nn.Module = TestEBCSparseArchZCH(
                 tables,  # pyre-ignore
@@ -1571,13 +1579,15 @@ def __init__(
             self.sparse = TestECSparseArch(
                 tables,  # pyre-ignore [6]
                 sparse_device,
+                **(submodule_kwargs or {}),
             )
         else:
             self.sparse = TestEBCSparseArch(
                 tables,  # pyre-ignore
                 weighted_tables,
                 sparse_device,
                 max_feature_lengths,
+                **(submodule_kwargs or {}),
             )
 
         embedding_names = (
@@ -1596,6 +1606,7 @@ def __init__(
             weighted_tables,
             embedding_names,
             dense_device,
+            **(submodule_kwargs or {}),
         )
         self.register_buffer(
             "dummy_ones",
```
