diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 727ce30..80f3370 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -177,3 +177,69 @@ jobs: - name: Check code style with PyCodestyle run: | pycodestyle --count --max-line-length 120 src/smr_discovery tests + + alignment_build: + # This job checks if the build succeeds + runs-on: ubuntu-latest + defaults: + run: + working-directory: ./smr_alignment + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Set up Python ${{ env.X_PYTHON_VERSION }} + uses: actions/setup-python@v4 + with: + python-version: ${{ env.X_PYTHON_VERSION }} + - name: Manually install local dependencies + run: pip install ../semantic_match_registry + - name: Build the package + run: pip install . + + alignment_test: + # This job runs the unittests + runs-on: ubuntu-latest + defaults: + run: + working-directory: ./smr_alignment + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ env.X_PYTHON_VERSION }} + uses: actions/setup-python@v4 + with: + python-version: ${{ env.X_PYTHON_VERSION }} + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install ../semantic_match_registry + pip install .[dev] + + - name: Run Python Tests + run: python -m unittest discover + + alignment_static-analysis: + # This job runs static code analysis, namely pycodestyle and mypy + runs-on: ubuntu-latest + defaults: + run: + working-directory: ./smr_alignment + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ env.X_PYTHON_VERSION }} + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install ../semantic_match_registry + pip install .[dev] + - name: Check typing with MyPy + run: | + mypy src/smr_alignment tests + - name: Check code style with PyCodestyle + run: | + pycodestyle --count --max-line-length 120 src/smr_alignment tests
diff --git a/README.md b/README.md index 2c46dab..676d3af 100644 --- a/README.md +++ b/README.md @@ -13,4 +13,5 @@ RWTH Aachen university. - [semantic_match_registry](/semantic_match_registry): Semantic Match Registry (SMR) service - [smr_discovery](/smr_discovery): Discovery service for SMRs +- [smr_alignment](/smr_alignment): Toolset for aligning real world data to the SMR concept diff --git a/smr_alignment/README.md b/smr_alignment/README.md new file mode 100644 index 0000000..585a562 --- /dev/null +++ b/smr_alignment/README.md @@ -0,0 +1,43 @@ +# SMR Alignment + +This package contains tools for checking the alignment of data to the requirements of the Semantic Match Registry +concept. + +## Check for Semantic Similarity Triangle Violations + +A tool for finding *Semantic Similarity Triangle Violations* is located in: `triangle_violation_checker.py`. + +You can use it as follows, to check a given `smr.algorithm.SemanticMatchGraph` for violations: + +```python +from smr.algorithm import SemanticMatchGraph +from smr_alignment.triangle_violation_checker import TriangleViolationChecker + +smg: SemanticMatchGraph = SemanticMatchGraph() # Your SemanticMatchGraph +tvc: TriangleViolationChecker = TriangleViolationChecker(smg) + +# The TriangleViolationChecker will automatically calculate the log-costs of the SemanticMatchGraph +# during initialization. 
+# You then have to decide whether to use Floyd-Warshall or Dijkstra for calculating the repaired costs +# (and therefore the violations); calling one of the two is sufficient: + +tvc.calculate_repaired_costs_floyd_warshall() # Use with dense (and small-ish) graphs +tvc.calculate_repaired_costs_dijkstra() # Use with sparse (and larger) graphs + +# Both algorithms are tested to give the same results (to within 12 decimal places) + +# Finally: Transform back to similarity space for the repaired similarities: +tvc.transform_back_to_similarity() + +# Now, smg has the repaired weights: + +for u, v, data in smg.edges(data=True): + print(data["weight"]) + + # Furthermore, we have some more practical data for each edge: + print(data["original_similarity"]) # The original, unmodified similarity + print(data["log_cost"]) # The similarity score transformed into log space + print(data["log_cost_repaired"]) # The repaired log_cost, made consistent with the inherent semantic model of the graph + print(data["violation"]) # True, if a violation was found else False + print(data["repaired_similarity"]) # The repaired similarity score (should be equal to "weight") +``` diff --git a/smr_alignment/pyproject.toml b/smr_alignment/pyproject.toml new file mode 100644 index 0000000..cb02aa9 --- /dev/null +++ b/smr_alignment/pyproject.toml @@ -0,0 +1,32 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "smr_alignment" +version = "0.1.0" +description = "Toolset to align real-world data with the SMR concept, implemented in Python" +readme = "README.md" +requires-python = ">=3.11" +authors = [{ name = "Sebastian Heppner", email = "mail@s-heppner.com" }] +dependencies = [ + "semantic_match_registry>=0.0.1", + "networkx>=3.4.2", + "tqdm>=4.46.1", +] + +[project.optional-dependencies] +dev = [ + "mypy", + "pycodestyle", + "coverage", + "types-networkx", + "types-tqdm", +] + +[tool.setuptools.package-dir] +"" = "src" + +[tool.setuptools.packages.find] +where = 
["src"] +include = ["smr_alignment*"] diff --git a/smr_alignment/src/smr_alignment/__init__.py b/smr_alignment/src/smr_alignment/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/smr_alignment/src/smr_alignment/py.typed b/smr_alignment/src/smr_alignment/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/smr_alignment/src/smr_alignment/triangle_violation_checker.py b/smr_alignment/src/smr_alignment/triangle_violation_checker.py new file mode 100644 index 0000000..d5678b7 --- /dev/null +++ b/smr_alignment/src/smr_alignment/triangle_violation_checker.py @@ -0,0 +1,113 @@ +""" +This module implements a checker to flag all semantic similarity triangle violations, i.e. cases where: + +s(A, C) < s(A, B) * s(B, C) + +These values violate the inherent semantic model of the graph and cause it not to be max-product transitive, +a requirement for the SMR concept. + +For more details, I'd like to refer to my dissertation. +""" +import math +from tqdm import tqdm + +import networkx as nx + +from smr.algorithm import SemanticMatchGraph + + +class TriangleViolationChecker: + def __init__(self, semantic_match_graph: SemanticMatchGraph) -> None: + self.semantic_match_graph: SemanticMatchGraph = semantic_match_graph + self.add_log_cost_to_graph() + + def add_log_cost_to_graph(self, epsilon: float = 1e-8) -> None: + """ + Adds a log-space 'log_cost' edge attribute to the graph, in place. + + Clamps edge weights into [epsilon, 1 - epsilon] to avoid + logarithmic singularities. 
+ """ + edges = tqdm( + self.semantic_match_graph.edges(data=True), + total=self.semantic_match_graph.number_of_edges(), + desc="Transforming to log-space", + ) + + for u, v, data in edges: + # keep the original similarity for later reference + s = float(data["weight"]) + data["original_similarity"] = s + + # clamp into [epsilon, 1 - epsilon] + s_clamped = min(max(s, epsilon), 1 - epsilon) + + # s_clamped is strictly positive, so the logarithm is well-defined + data["log_cost"] = -math.log(s_clamped) + + def calculate_repaired_costs_floyd_warshall(self) -> None: + """ + Repair costs using all-pairs shortest paths (Floyd–Warshall). + Sets log_cost_repaired = min(direct_cost, shortest_path_cost) and the violation flag on every edge. + + Note: Floyd–Warshall is simple and cubic in node count. Use it when the graph is small-ish or fairly dense. + """ + print("Calculating distances using Floyd-Warshall. This may take a while (O(n^3)).") + distances = dict(nx.floyd_warshall(self.semantic_match_graph, weight="log_cost")) + + # Apply repairs with a progress bar over edges + edges = tqdm( + self.semantic_match_graph.edges(data=True), + total=self.semantic_match_graph.number_of_edges(), + desc="Applying Floyd-Warshall repaired costs", + ) + for u, v, data in edges: + direct = float(data["log_cost"]) + shortest = distances[u][v] # never exceeds direct: the edge (u, v) is itself a path + repaired = min(direct, shortest) + data["log_cost_repaired"] = repaired + data["violation"] = True if repaired < direct else False + + def calculate_repaired_costs_dijkstra(self) -> None: + """ + Repair costs using repeated single-source Dijkstra. + Sets log_cost_repaired = min(direct_cost, shortest_path_cost) and the violation flag on every edge. + + Note: Better for large, sparse graphs. Roughly O(m * log n) per source. 
+ """ + nodes = self.semantic_match_graph.nodes + + for u in tqdm(nodes, total=len(nodes), desc="Running Dijkstra per node"): + dists = nx.single_source_dijkstra_path_length(self.semantic_match_graph, source=u, weight="log_cost") + for _, v, data in self.semantic_match_graph.out_edges(u, data=True): + direct = float(data["log_cost"]) + shortest = dists.get(v, float("inf")) # inf if unreachable + repaired = min(direct, shortest) + data["log_cost_repaired"] = repaired + data["violation"] = True if repaired < direct else False + + def transform_back_to_similarity(self) -> None: + """ + Transforms repaired log-costs back into similarity scores. + + For each edge: + repaired_similarity = exp(-log_cost_repaired) + weight = repaired_similarity + """ + edges = tqdm( + self.semantic_match_graph.edges(data=True), + total=self.semantic_match_graph.number_of_edges(), + desc="Transforming back to similarity space", + ) + + for _, _, data in edges: + # Retrieve the repaired log-cost (if absent, skip) + if "log_cost_repaired" not in data: + continue + + repaired_cost = float(data["log_cost_repaired"]) + repaired_similarity = math.exp(-repaired_cost) + + # Store for reference and update active weight + data["repaired_similarity"] = repaired_similarity + data["weight"] = repaired_similarity diff --git a/smr_alignment/tests/__init__.py b/smr_alignment/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/smr_alignment/tests/test_triangle_violation_checker.py b/smr_alignment/tests/test_triangle_violation_checker.py new file mode 100644 index 0000000..f00f88f --- /dev/null +++ b/smr_alignment/tests/test_triangle_violation_checker.py @@ -0,0 +1,154 @@ +import math +import unittest + +from smr.algorithm import SemanticMatchGraph +from smr_alignment.triangle_violation_checker import TriangleViolationChecker + + +TOLERANCE = 1e-12 +EPSILON = 1e-8 + + +class TestTriangleViolationChecker(unittest.TestCase): + def make_graph_with_triangle_violation(self): + """ + Build a 
tiny graph where: + s(A,B)=0.9, s(B,C)=0.9, s(A,C)=0.5 + This violates s(A,C) >= s(A,B)*s(B,C) because 0.5 < 0.81. + """ + G = SemanticMatchGraph() + G.add_semantic_match("A", "B", 0.9) + G.add_semantic_match("B", "C", 0.9) + G.add_semantic_match("A", "C", 0.5) + return G + + def make_graph_consistent_triangle(self): + """ + Build a tiny graph where: + s(A,B)=0.5, s(B,C)=0.5, s(A,C)=0.3 + This is consistent because 0.3 >= 0.25. + """ + G = SemanticMatchGraph() + G.add_semantic_match("A", "B", 0.5) + G.add_semantic_match("B", "C", 0.5) + G.add_semantic_match("A", "C", 0.3) + return G + + def test_add_log_cost_clamps_both_sides(self): + G = SemanticMatchGraph() + G.add_semantic_match("u", "v", 0.0) # will clamp to EPS + G.add_semantic_match("v", "w", 1.0) # will clamp to 1 - EPSILON + G.add_semantic_match("w", "x", 0.5) # in range + + tvc = TriangleViolationChecker(G) + + # u->v + d = G["u"]["v"] + self.assertAlmostEqual(d["original_similarity"], 0.0, places=12) + self.assertAlmostEqual(d["log_cost"], -math.log(EPSILON), places=8) + + # v->w + d = G["v"]["w"] + self.assertAlmostEqual(d["original_similarity"], 1.0, places=12) + self.assertAlmostEqual(d["log_cost"], -math.log(1 - EPSILON), places=8) + + # w->x + d = G["w"]["x"] + self.assertAlmostEqual(d["original_similarity"], 0.5, places=12) + self.assertAlmostEqual(d["log_cost"], -math.log(0.5), places=12) + + def test_floyd_warshall_repairs_violation_and_sets_flags(self): + G = self.make_graph_with_triangle_violation() + tvc = TriangleViolationChecker(G) + tvc.calculate_repaired_costs_floyd_warshall() + + # A->C should be repaired to product 0.9*0.9 = 0.81 in similarity + # In cost space, repaired = -log(0.81) unless clamping changes it (it doesn't here) + ac = G["A"]["C"] + direct_cost = ac["log_cost"] + repaired_cost = ac["log_cost_repaired"] + + self.assertLess(repaired_cost, direct_cost + TOLERANCE) # improved + expected_cost = -math.log(0.9) + -math.log(0.9) # two-edge path cost + 
self.assertAlmostEqual(repaired_cost, expected_cost, places=12) + self.assertTrue(ac["violation"]) + + # A->B and B->C should be unchanged (still optimal direct edges) + self.assertFalse(G["A"]["B"]["violation"]) + self.assertFalse(G["B"]["C"]["violation"]) + self.assertAlmostEqual(G["A"]["B"]["log_cost_repaired"], G["A"]["B"]["log_cost"], places=12) + self.assertAlmostEqual(G["B"]["C"]["log_cost_repaired"], G["B"]["C"]["log_cost"], places=12) + + def test_dijkstra_matches_floyd_warshall(self): + # Build two identical copies and run the two methods + G1 = self.make_graph_with_triangle_violation() + G2 = self.make_graph_with_triangle_violation() + + tvc1 = TriangleViolationChecker(G1) + tvc1.calculate_repaired_costs_floyd_warshall() + + tvc2 = TriangleViolationChecker(G2) + tvc2.calculate_repaired_costs_dijkstra() + + # Compare repaired costs edge by edge + for u, v in G1.edges(): + c1 = G1[u][v]["log_cost_repaired"] + c2 = G2[u][v]["log_cost_repaired"] + self.assertAlmostEqual(c1, c2, places=12) + + # flags consistent + self.assertEqual(G1[u][v]["violation"], G2[u][v]["violation"]) + + def test_transform_back_updates_weight_and_stores_similarity(self): + G = self.make_graph_with_triangle_violation() + tvc = TriangleViolationChecker(G) + tvc.calculate_repaired_costs_floyd_warshall() + tvc.transform_back_to_similarity() + + # A->C should now have weight = 0.81, repaired_similarity = 0.81 + ac = G["A"]["C"] + repaired_similarity = ac["repaired_similarity"] + self.assertAlmostEqual(repaired_similarity, 0.9 * 0.9, places=12) + self.assertAlmostEqual(ac["weight"], repaired_similarity, places=12) + + # A->B should remain 0.9 + ab = G["A"]["B"] + self.assertAlmostEqual(ab["repaired_similarity"], ab["weight"], places=12) + self.assertAlmostEqual(ab["weight"], 0.9, places=12) + + def test_consistent_graph_no_change(self): + G = self.make_graph_consistent_triangle() + tvc = TriangleViolationChecker(G) + tvc.calculate_repaired_costs_dijkstra() # either method should keep 
things + + # No violations expected + for _, _, d in G.edges(data=True): + self.assertIn("log_cost_repaired", d) + self.assertFalse(d["violation"]) + self.assertAlmostEqual(d["log_cost_repaired"], d["log_cost"], places=12) + + # Back transform preserves original weights + original = {(u, v): d["weight"] for u, v, d in G.edges(data=True)} + tvc.transform_back_to_similarity() + for u, v, d in G.edges(data=True): + self.assertAlmostEqual(d["weight"], original[(u, v)], places=12) + self.assertAlmostEqual(d["repaired_similarity"], original[(u, v)], places=12) + + def test_unreachable_nodes_do_not_fake_repairs(self): + # Two components: A->B and C->D. No path A->D etc. + G = SemanticMatchGraph() + G.add_semantic_match("A", "B", 0.6) + G.add_semantic_match("C", "D", 0.7) + + tvc = TriangleViolationChecker(G) + tvc.calculate_repaired_costs_dijkstra() + + # Each edge should keep its direct cost; no violations, since no path can improve it + for _, _, d in G.edges(data=True): + self.assertIn("log_cost_repaired", d) + self.assertAlmostEqual(d["log_cost_repaired"], d["log_cost"], places=12) + self.assertFalse(d["violation"]) + + +if __name__ == "__main__": + unittest.main()