Skip to content

Commit 223db0d

Browse files
TroyGarden and meta-codesync[bot]
authored and committed
add pipeline benchmark for kvzch (#3604)
Summary: Pull Request resolved: #3604 # context * modify the kvzch benchmark configs to better represent the real use case * add config pass-in to the test models * fix small bugs and minor refactoring # changes * the previous kv-zch embedding table is too small, so the prefetch process is too short; after this change (increased table size) the prefetch process is longer {F1983784711} {F1983784733} # benchmark |short name |GPU Runtime (P90)|CPU Runtime (P90)|GPU Peak Mem alloc (P90)|GPU Peak Mem reserved (P90)|GPU Mem used (P90)|Malloc retries (P50/P90/P100)|CPU Peak RSS (P90)| |--|--|--|--|--|--|--|--| |regular-base |9864.51 ms |9403.68 ms |33.77 GB |49.66 GB |50.71 GB |0.0 / 0.0 / 0.0 |30.65 GB | |kvzch-base |18804.26 ms |44245.82 ms |25.28 GB |36.33 GB |37.38 GB |0.0 / 0.0 / 0.0 |31.18 GB | |base-inplace |20141.71 ms |46805.58 ms |25.28 GB |34.39 GB |35.44 GB |0.0 / 0.0 / 0.0 |31.19 GB | |kvzch-sdd |20382.59 ms |45647.02 ms |33.42 GB |47.52 GB |48.56 GB |0.0 / 0.0 / 0.0 |31.13 GB | |kvzch-prefetch |17951.19 ms |38598.57 ms |33.45 GB |47.16 GB |48.21 GB |0.0 / 0.0 / 0.0 |30.83 GB | |regular-base |49710.51 ms |74880.50 ms |43.14 GB |50.63 GB |51.68 GB |0.0 / 0.0 / 0.0 |33.57 GB | Reviewed By: spmex Differential Revision: D84268361 fbshipit-source-id: e28abb6fb6ccb1121dcf4ae778e26520b454da8c
1 parent 5cf0f0d commit 223db0d

File tree

3 files changed

+34
-6
lines changed

3 files changed

+34
-6
lines changed

torchrec/distributed/benchmark/yaml/prefetch_kvzch.yml

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,33 @@ RunOptions:
1010
sharding_type: table_wise
1111
profile_dir: "."
1212
name: "sparsenn_prefetch_kvzch_dram"
13+
memory_snapshot: True
14+
loglevel: "info"
15+
num_float_features: 1000
1316
PipelineConfig:
1417
pipeline: "prefetch"
18+
# inplace_copy_batch_to_gpu: True
1519
ModelInputConfig:
16-
feature_pooling_avg: 30
20+
num_float_features: 1000
21+
feature_pooling_avg: 60
22+
ModelSelectionConfig:
23+
model_name: "test_sparse_nn"
24+
model_config:
25+
num_float_features: 1000
26+
submodule_kwargs:
27+
dense_arch_out_size: 1024
28+
over_arch_out_size: 4096
29+
over_arch_hidden_layers: 10
30+
dense_arch_hidden_sizes: [128, 128, 128]
31+
1732
EmbeddingTablesConfig:
18-
num_unweighted_features: 10
19-
num_weighted_features: 10
33+
num_unweighted_features: 50
34+
num_weighted_features: 50
2035
embedding_feature_dim: 256
2136
additional_tables:
2237
- - name: FP16_table
2338
embedding_dim: 512
24-
num_embeddings: 100_000 # Both feature hashsize and virtual table size
39+
num_embeddings: 1_000_000 # Both feature hashsize and virtual table size
2540
feature_names: ["additional_0_0"]
2641
data_type: FP16
2742
total_num_buckets: 100 # num_embedding should be divisible by total_num_buckets

torchrec/distributed/test_utils/model_config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ class TestSparseNNConfig(BaseModelConfig):
8888
over_arch_clazz: Type[nn.Module] = TestOverArchLarge
8989
postproc_module: Optional[nn.Module] = None
9090
zch: bool = False
91+
submodule_kwargs: Optional[Dict[str, Any]] = None
9192

9293
def generate_model(
9394
self,
@@ -108,6 +109,7 @@ def generate_model(
108109
postproc_module=self.postproc_module,
109110
embedding_groups=self.embedding_groups,
110111
zch=self.zch,
112+
submodule_kwargs=self.submodule_kwargs,
111113
)
112114

113115

torchrec/distributed/test_utils/test_model.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -881,6 +881,7 @@ def __init__(
881881
device: Optional[torch.device] = None,
882882
dense_arch_out_size: Optional[int] = None,
883883
dense_arch_hidden_sizes: Optional[List[int]] = None,
884+
**_kwargs: Any,
884885
) -> None:
885886
"""
886887
Args:
@@ -1191,6 +1192,8 @@ def __init__(
11911192
dense_arch_out_size: Optional[int] = None,
11921193
over_arch_out_size: Optional[int] = None,
11931194
over_arch_hidden_layers: Optional[int] = None,
1195+
over_arch_hidden_repeat: Optional[int] = None,
1196+
**_kwargs: Any,
11941197
) -> None:
11951198
"""
11961199
Args:
@@ -1237,7 +1240,8 @@ def __init__(
12371240
),
12381241
SwishLayerNorm([out_features]),
12391242
]
1240-
1243+
for _ in range(over_arch_hidden_repeat or 0):
1244+
layers += layers[1:]
12411245
self.overarch = torch.nn.Sequential(*layers)
12421246

12431247
self.regroup_module = KTRegroupAsDict(
@@ -1398,6 +1402,7 @@ def __init__(
13981402
weighted_tables: List[EmbeddingBagConfig],
13991403
device: Optional[torch.device] = None,
14001404
max_feature_lengths: Optional[Dict[str, int]] = None,
1405+
**_kwargs: Any,
14011406
) -> None:
14021407
"""
14031408
Args:
@@ -1547,6 +1552,7 @@ def __init__(
15471552
over_arch_clazz: Optional[Type[nn.Module]] = None,
15481553
postproc_module: Optional[nn.Module] = None,
15491554
zch: bool = False,
1555+
submodule_kwargs: Optional[Dict[str, Any]] = None,
15501556
) -> None:
15511557
super().__init__(
15521558
tables=cast(List[BaseEmbeddingConfig], tables),
@@ -1559,7 +1565,9 @@ def __init__(
15591565
over_arch_clazz = TestOverArch
15601566
if weighted_tables is None:
15611567
weighted_tables = []
1562-
self.dense = TestDenseArch(num_float_features, device=dense_device)
1568+
self.dense = TestDenseArch(
1569+
num_float_features, device=dense_device, **(submodule_kwargs or {})
1570+
)
15631571
if zch:
15641572
self.sparse: nn.Module = TestEBCSparseArchZCH(
15651573
tables, # pyre-ignore
@@ -1571,13 +1579,15 @@ def __init__(
15711579
self.sparse = TestECSparseArch(
15721580
tables, # pyre-ignore [6]
15731581
sparse_device,
1582+
**(submodule_kwargs or {}),
15741583
)
15751584
else:
15761585
self.sparse = TestEBCSparseArch(
15771586
tables, # pyre-ignore
15781587
weighted_tables,
15791588
sparse_device,
15801589
max_feature_lengths,
1590+
**(submodule_kwargs or {}),
15811591
)
15821592

15831593
embedding_names = (
@@ -1596,6 +1606,7 @@ def __init__(
15961606
weighted_tables,
15971607
embedding_names,
15981608
dense_device,
1609+
**(submodule_kwargs or {}),
15991610
)
16001611
self.register_buffer(
16011612
"dummy_ones",

0 commit comments

Comments
 (0)