Commit f072c2e

tests: add tests for ECEPartitioner
1 parent 9ed56a6 · commit f072c2e

4 files changed, +209 -12 lines changed

graphgen/configs/aggregated_config.yaml

Lines changed: 3 additions & 8 deletions
@@ -13,14 +13,9 @@ quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
 partition: # graph partition configuration
   method: ece # ece is a custom partition method based on comprehension loss
   method_params:
-    bidirectional: true # whether to traverse the graph in both directions
-    edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
-    expand_method: max_width # expand method, support: max_width, max_depth
-    isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
-    max_depth: 5 # maximum depth for graph traversal
-    max_extra_edges: 20 # max edges per direction (if expand_method="max_width")
-    max_tokens: 256 # restricts input length (if expand_method="max_tokens")
-    loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+    max_units_per_community: 10 # max nodes and edges per community
+    max_tokens_per_community: 10240 # max tokens per community
+    unit_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
 generate:
   mode: aggregated # atomic, aggregated, multi_hop, cot
   data_format: ChatML # Alpaca, Sharegpt, ChatML
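Note: the three new method_params correspond one-to-one to the keyword arguments of ECEPartitioner.partition changed in the next file. A minimal sketch of forwarding them by hand; the config-loading glue, the run_partition name, and the "kg" namespace are illustrative assumptions, not the project's actual wiring:

import asyncio

import yaml  # assumes PyYAML is available

from graphgen.models import ECEPartitioner, NetworkXStorage


async def run_partition(config_path: str, working_dir: str) -> None:
    # Hypothetical glue: read the partition section and pass method_params
    # straight through as keyword arguments.
    with open(config_path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    params = cfg["partition"]["method_params"]  # assumed config shape

    storage = NetworkXStorage(working_dir=working_dir, namespace="kg")
    communities = await ECEPartitioner().partition(
        storage,
        max_units_per_community=params["max_units_per_community"],
        max_tokens_per_community=params["max_tokens_per_community"],
        unit_sampling=params["unit_sampling"],
    )
    print(f"{len(communities)} communities")


asyncio.run(run_partition("graphgen/configs/aggregated_config.yaml", "./cache"))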

graphgen/models/partitioner/ece_partitioner.py

Lines changed: 3 additions & 3 deletions
@@ -55,7 +55,7 @@ async def partition(
         g: BaseGraphStorage,
         max_units_per_community: int = 10,
         max_tokens_per_community: int = 10240,
-        edge_sampling: str = "random",
+        unit_sampling: str = "random",
         **kwargs: Any,
     ) -> List[Community]:
         nodes: List[Tuple[str, dict]] = await g.get_all_nodes()
@@ -73,7 +73,7 @@ async def partition(
         used_e: Set[frozenset[str]] = set()
         communities: List = []
 
-        all_units = self._sort_units(all_units, edge_sampling)
+        all_units = self._sort_units(all_units, unit_sampling)
 
         async def _grow_community(seed_unit: Tuple[str, Any, dict]) -> Community:
             nonlocal used_n, used_e
@@ -124,7 +124,7 @@ async def _add_unit(u):
                 if n_id not in used_n and n_id not in community_nodes:
                     neighbors.append(("n", n_id, node_dict[n_id]))
 
-            neighbors = self._sort_units(neighbors, edge_sampling)
+            neighbors = self._sort_units(neighbors, unit_sampling)
             for nb in neighbors:
                 if (
                     len(community_nodes) + len(community_edges)
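The renamed unit_sampling argument controls the order in which candidate units (nodes and edges) are visited during community growth. The project's _sort_units implementation is not part of this diff; the sketch below is an illustrative standalone version of the three strategies the config documents (random, min_loss, max_loss), using the (kind, id, data) unit tuples and the "loss" attribute seen in this file and in the tests, with a default loss of 0.0 as an assumption:

import random
from typing import Any, List, Tuple


def sort_units_sketch(
    units: List[Tuple[str, Any, dict]], unit_sampling: str = "random"
) -> List[Tuple[str, Any, dict]]:
    """Illustrative ordering of (kind, id, data) units; not the repository's _sort_units."""
    if unit_sampling == "random":
        shuffled = units[:]
        random.shuffle(shuffled)
        return shuffled
    if unit_sampling in ("min_loss", "max_loss"):
        # Units without a recorded comprehension loss sort as 0.0 (assumption).
        return sorted(
            units,
            key=lambda u: u[2].get("loss", 0.0),
            reverse=(unit_sampling == "max_loss"),
        )
    raise ValueError(f"Unknown unit_sampling strategy: {unit_sampling}")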

graphgen/operators/partition/partition_kg.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ async def partition_kg(
     elif method == "ece":
         logger.info("Partitioning knowledge graph using ECE method.")
         # TODO: before ECE partitioning, we need to:
-        # 1. 'quiz and judge' to get the comprehension loss
+        # 1. 'quiz and judge' to get the comprehension loss if unit_sampling is not random
         # 2. pre-tokenize nodes and edges to get the token length
         edges = await kg_instance.get_all_edges()
         nodes = await kg_instance.get_all_nodes()
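The 'loss' and 'length' attributes named in this TODO are exactly what the new tests below populate by hand on each node and edge. A minimal sketch of that precondition, reusing only calls that appear in the tests; the "demo" namespace and attribute values are made up:

import asyncio
import tempfile

from graphgen.models import ECEPartitioner, NetworkXStorage


async def demo() -> None:
    with tempfile.TemporaryDirectory() as tmpdir:
        storage = NetworkXStorage(working_dir=tmpdir, namespace="demo")
        # 'loss' would normally come from quiz-and-judge and 'length' from
        # pre-tokenization; here both are hard-coded, as in the tests.
        await storage.upsert_node("A", {"desc": "entity A", "length": 12, "loss": 0.4})
        await storage.upsert_node("B", {"desc": "entity B", "length": 9, "loss": 0.2})
        await storage.upsert_edge("A", "B", {"desc": "A relates to B", "length": 7, "loss": 0.3})
        communities = await ECEPartitioner().partition(storage, max_units_per_community=3)
        print(len(communities))


asyncio.run(demo())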
Lines changed: 202 additions & 0 deletions
@@ -0,0 +1,202 @@
import tempfile

import pytest

from graphgen.bases.datatypes import Community
from graphgen.models import ECEPartitioner, NetworkXStorage


@pytest.mark.asyncio
async def test_ece_empty_graph():
    """ECE partitioning on an empty graph should return an empty community list."""
    with tempfile.TemporaryDirectory() as tmpdir:
        storage = NetworkXStorage(working_dir=tmpdir, namespace="empty")
        partitioner = ECEPartitioner()
        communities = await partitioner.partition(
            storage, max_units_per_community=5, unit_sampling="random"
        )
        assert communities == []


@pytest.mark.asyncio
async def test_ece_single_node():
    """A single node must be placed in exactly one community under any edge-sampling strategy."""
    nodes = [("A", {"desc": "alone", "length": 10, "loss": 0.1})]

    for strategy in ("random", "min_loss", "max_loss"):
        with tempfile.TemporaryDirectory() as tmpdir:
            storage = NetworkXStorage(
                working_dir=tmpdir, namespace=f"single_{strategy}"
            )
            for nid, ndata in nodes:
                await storage.upsert_node(nid, ndata)

            partitioner = ECEPartitioner()
            communities: list[Community] = await partitioner.partition(
                storage, max_units_per_community=5, unit_sampling=strategy
            )
            assert len(communities) == 1
            assert communities[0].nodes == ["A"]
            assert communities[0].edges == []


@pytest.mark.asyncio
async def test_ece_small_graph_random():
    """
    2x3 grid graph:
        0 — 1 — 2
        |   |   |
        3 — 4 — 5
    6 nodes & 7 edges, max_units=4 => at least 3 communities expected with random sampling.
    """
    nodes = [(str(i), {"desc": f"node{i}", "length": 10}) for i in range(6)]
    edges = [
        ("0", "1", {"desc": "e01", "loss": 0.1, "length": 5}),
        ("1", "2", {"desc": "e12", "loss": 0.2, "length": 5}),
        ("0", "3", {"desc": "e03", "loss": 0.3, "length": 5}),
        ("1", "4", {"desc": "e14", "loss": 0.4, "length": 5}),
        ("2", "5", {"desc": "e25", "loss": 0.5, "length": 5}),
        ("3", "4", {"desc": "e34", "loss": 0.6, "length": 5}),
        ("4", "5", {"desc": "e45", "loss": 0.7, "length": 5}),
    ]

    with tempfile.TemporaryDirectory() as tmpdir:
        storage = NetworkXStorage(working_dir=tmpdir, namespace="small_random")
        for nid, ndata in nodes:
            await storage.upsert_node(nid, ndata)
        for src, tgt, edata in edges:
            await storage.upsert_edge(src, tgt, edata)

        partitioner = ECEPartitioner()
        communities: list[Community] = await partitioner.partition(
            storage, max_units_per_community=4, unit_sampling="random"
        )

        # Basic integrity checks
        all_nodes = set()
        all_edges = set()
        for c in communities:
            assert len(c.nodes) + len(c.edges) <= 4
            all_nodes.update(c.nodes)
            all_edges.update((u, v) if u < v else (v, u) for u, v in c.edges)
        assert all_nodes == {str(i) for i in range(6)}
        assert len(all_edges) == 7


@pytest.mark.asyncio
async def test_ece_small_graph_min_loss():
    """
    Same grid graph, but using min_loss sampling.
    Edges with lower loss should be preferred during community expansion.
    """
    nodes = [
        (str(i), {"desc": f"node{i}", "length": 10, "loss": i * 0.1}) for i in range(6)
    ]
    edges = [
        ("0", "1", {"desc": "e01", "loss": 0.05, "length": 5}),
        ("1", "2", {"desc": "e12", "loss": 0.10, "length": 5}),
        ("0", "3", {"desc": "e03", "loss": 0.15, "length": 5}),
        ("1", "4", {"desc": "e14", "loss": 0.20, "length": 5}),
        ("2", "5", {"desc": "e25", "loss": 0.25, "length": 5}),
        ("3", "4", {"desc": "e34", "loss": 0.30, "length": 5}),
        ("4", "5", {"desc": "e45", "loss": 0.35, "length": 5}),
    ]

    with tempfile.TemporaryDirectory() as tmpdir:
        storage = NetworkXStorage(working_dir=tmpdir, namespace="small_min")
        for nid, ndata in nodes:
            await storage.upsert_node(nid, ndata)
        for src, tgt, edata in edges:
            await storage.upsert_edge(src, tgt, edata)

        partitioner = ECEPartitioner()
        communities: list[Community] = await partitioner.partition(
            storage, max_units_per_community=4, unit_sampling="min_loss"
        )

        all_nodes = set()
        all_edges = set()
        for c in communities:
            assert len(c.nodes) + len(c.edges) <= 4
            all_nodes.update(c.nodes)
            all_edges.update((u, v) if u < v else (v, u) for u, v in c.edges)
        assert all_nodes == {str(i) for i in range(6)}
        assert len(all_edges) == 7


@pytest.mark.asyncio
async def test_ece_small_graph_max_loss():
    """
    Same grid graph, but using max_loss sampling.
    Edges with higher loss should be preferred during community expansion.
    """
    nodes = [
        (str(i), {"desc": f"node{i}", "length": 10, "loss": (5 - i) * 0.1})
        for i in range(6)
    ]
    edges = [
        ("0", "1", {"desc": "e01", "loss": 0.35, "length": 5}),
        ("1", "2", {"desc": "e12", "loss": 0.30, "length": 5}),
        ("0", "3", {"desc": "e03", "loss": 0.25, "length": 5}),
        ("1", "4", {"desc": "e14", "loss": 0.20, "length": 5}),
        ("2", "5", {"desc": "e25", "loss": 0.15, "length": 5}),
        ("3", "4", {"desc": "e34", "loss": 0.10, "length": 5}),
        ("4", "5", {"desc": "e45", "loss": 0.05, "length": 5}),
    ]

    with tempfile.TemporaryDirectory() as tmpdir:
        storage = NetworkXStorage(working_dir=tmpdir, namespace="small_max")
        for nid, ndata in nodes:
            await storage.upsert_node(nid, ndata)
        for src, tgt, edata in edges:
            await storage.upsert_edge(src, tgt, edata)

        partitioner = ECEPartitioner()
        communities: list[Community] = await partitioner.partition(
            storage, max_units_per_community=4, unit_sampling="max_loss"
        )

        all_nodes = set()
        all_edges = set()
        for c in communities:
            assert len(c.nodes) + len(c.edges) <= 4
            all_nodes.update(c.nodes)
            all_edges.update((u, v) if u < v else (v, u) for u, v in c.edges)
        assert all_nodes == {str(i) for i in range(6)}
        assert len(all_edges) == 7


@pytest.mark.asyncio
async def test_ece_max_tokens_limit():
    """Ensure max_tokens_per_community is respected."""
    # node id -> data
    node_data = {"A": {"length": 3000}, "B": {"length": 3000}, "C": {"length": 3000}}
    # edge list
    edges = [("A", "B", {"loss": 0.1, "length": 2000})]

    with tempfile.TemporaryDirectory() as tmpdir:
        storage = NetworkXStorage(working_dir=tmpdir, namespace="token_limit")
        for nid, ndata in node_data.items():
            await storage.upsert_node(nid, ndata)
        for src, tgt, edata in edges:
            await storage.upsert_edge(src, tgt, edata)

        partitioner = ECEPartitioner()
        communities: list[Community] = await partitioner.partition(
            storage,
            max_units_per_community=10,
            max_tokens_per_community=5000,  # 1 node (3000) + 1 edge (2000) = 5000
            unit_sampling="random",
        )

        # With a 5000-token budget we need at least two communities
        assert len(communities) >= 2

        # helper: quick edge lookup
        edge_lens = {(u, v): d["length"] for u, v, d in edges}
        edge_lens.update({(v, u): d["length"] for u, v, d in edges})  # undirected

        for c in communities:
            node_tokens = sum(node_data[n]["length"] for n in c.nodes)
            edge_tokens = sum(edge_lens[e] for e in c.edges)
            assert node_tokens + edge_tokens <= 5000
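Note: all five tests are coroutine tests marked with @pytest.mark.asyncio, so running them requires an asyncio-capable pytest plugin such as pytest-asyncio to be installed in the test environment.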
