66 changes: 66 additions & 0 deletions .github/workflows/ci.yml
@@ -177,3 +177,69 @@ jobs:
- name: Check code style with PyCodestyle
run: |
pycodestyle --count --max-line-length 120 src/smr_discovery tests

alignment_build:
# This job checks if the build succeeds
runs-on: ubuntu-latest
defaults:
run:
working-directory: ./smr_alignment
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python ${{ env.X_PYTHON_VERSION }}
uses: actions/setup-python@v4
with:
python-version: ${{ env.X_PYTHON_VERSION }}
- name: Manually install local dependencies
run: pip install ../semantic_match_registry
- name: Build the package
run: pip install .

alignment_test:
# This job runs the unittests
runs-on: ubuntu-latest
defaults:
run:
working-directory: ./smr_alignment
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python ${{ env.X_PYTHON_VERSION }}
uses: actions/setup-python@v4
with:
python-version: ${{ env.X_PYTHON_VERSION }}

- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install ../semantic_match_registry
pip install .[dev]

- name: Run Python Tests
run: python -m unittest discover

alignment_static-analysis:
# This job runs static code analysis, namely pycodestyle and mypy
runs-on: ubuntu-latest
defaults:
run:
working-directory: ./smr_alignment
steps:
- uses: actions/checkout@v4
- name: Set up Python
        uses: actions/setup-python@v4
with:
python-version: ${{ env.X_PYTHON_VERSION }}
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install ../semantic_match_registry
pip install .[dev]
- name: Check typing with MyPy
run: |
mypy src/smr_alignment tests
- name: Check code style with PyCodestyle
run: |
pycodestyle --count --max-line-length 120 src/smr_alignment tests
1 change: 1 addition & 0 deletions README.md
@@ -13,4 +13,5 @@ RWTH Aachen university.

- [semantic_match_registry](/semantic_match_registry): Semantic Match Registry (SMR) service
- [smr_discovery](/smr_discovery): Discovery service for SMRs
- [smr_alignment](/smr_alignment): Toolset for aligning real-world data to the SMR concept

43 changes: 43 additions & 0 deletions smr_alignment/README.md
@@ -0,0 +1,43 @@
# SMR Alignment

This package contains tools for checking the alignment of data to the requirements of the Semantic Match Registry
concept.
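
As a hypothetical illustration (the values below are made up and not part of the package), a similarity triple violates max-product transitivity when the direct similarity falls below the product of the similarities along an indirect path:

```python
# Hypothetical similarity values, purely for illustration
s_ab, s_bc, s_ac = 0.9, 0.9, 0.5

# Max-product transitivity requires s(A, C) >= s(A, B) * s(B, C)
is_violation = s_ac < s_ab * s_bc
print(is_violation)  # True: 0.5 < 0.81, so this triangle is violated
```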

## Check for Semantic Similarity Triangle Violations

A tool for finding *Semantic Similarity Triangle Violations* is located in: `triangle_violation_checker.py`.

You can use it as follows to check a given `smr.algorithm.SemanticMatchGraph` for violations:

```python
from smr.algorithm import SemanticMatchGraph
from smr_alignment.triangle_violation_checker import TriangleViolationChecker

smg: SemanticMatchGraph = SemanticMatchGraph() # Your SemanticMatchGraph
tvc: TriangleViolationChecker = TriangleViolationChecker(smg)

# The TriangleViolationChecker will automatically calculate the log-costs of the SemanticMatchGraph
# during initialization.
# You then have to decide whether to use Floyd-Warshall or Dijkstra for calculating the repaired costs
# (and therefore the violations):

tvc.calculate_repaired_costs_floyd_warshall() # Use with dense (and small-ish) graphs
tvc.calculate_repaired_costs_dijkstra() # Use with sparse (and larger) graphs

# Both algorithms are tested to give the same results (up to 12 decimal places)

# Finally: Transform back to similarity space for the repaired similarities:
tvc.transform_back_to_similarity()

# Now, smg has the repaired weights:

for u, v, data in smg.edges(data=True):
    print(data["weight"])

    # Furthermore, each edge carries some more practical data:
    print(data["original_similarity"])  # The original, unmodified similarity
    print(data["log_cost"])             # The similarity score transformed into log space
    print(data["log_cost_repaired"])    # The repaired log-cost, consistent with the graph's inherent semantic model
    print(data["violation"])            # True if a violation was found, else False
    print(data["repaired_similarity"])  # The repaired similarity (equal to "weight" after the transform)
```
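
The repair works because the log transform turns products into sums: with `cost(u, v) = -ln s(u, v)`, the max-product condition `s(A, C) >= s(A, B) * s(B, C)` becomes `cost(A, C) <= cost(A, B) + cost(B, C)`, so a violating edge is exactly one that is longer than some shortest path between its endpoints. A minimal sketch of this equivalence, using a hypothetical three-node graph (not part of the package):

```python
import math

import networkx as nx

# Hypothetical toy graph with one violating edge
g = nx.DiGraph()
g.add_edge("A", "B", log_cost=-math.log(0.9))
g.add_edge("B", "C", log_cost=-math.log(0.9))
g.add_edge("A", "C", log_cost=-math.log(0.5))  # 0.5 < 0.9 * 0.9 -> violation

# Shortest paths from A in log space; the path A -> B -> C is shorter
# than the direct edge A -> C, which flags the violation
dists = nx.single_source_dijkstra_path_length(g, "A", weight="log_cost")
direct = g["A"]["C"]["log_cost"]
repaired = min(direct, dists["C"])

print(math.exp(-repaired))  # ~0.81 = 0.9 * 0.9, the repaired similarity
```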
32 changes: 32 additions & 0 deletions smr_alignment/pyproject.toml
@@ -0,0 +1,32 @@
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "smr_alignment"
version = "0.1.0"
description = "Toolset to align real-world data with the SMR concept, implemented in Python"
readme = "README.md"
requires-python = ">=3.11"
authors = [{ name = "Sebastian Heppner", email = "mail@s-heppner.com" }]
dependencies = [
"semantic_match_registry>=0.0.1",
"networkx>=3.4.2",
"tqdm>=4.46.1",
]

[project.optional-dependencies]
dev = [
"mypy",
"pycodestyle",
"coverage",
"types-networkx",
"types-tqdm",
]

[tool.setuptools.package-dir]
"" = "src"

[tool.setuptools.packages.find]
where = ["src"]
include = ["smr_alignment*"]
Empty file.
Empty file.
113 changes: 113 additions & 0 deletions smr_alignment/src/smr_alignment/triangle_violation_checker.py
@@ -0,0 +1,113 @@
"""
This module implements a checker to flag all semantic similarity triangle violations, i.e. cases where:

    s(A, C) < s(A, B) * s(B, C)

Such values violate the inherent semantic model of the graph and make it not max-product transitive,
a requirement for the SMR concept.

For more details, I'd like to refer to my dissertation.
"""
import math
from tqdm import tqdm

import networkx as nx

from smr.algorithm import SemanticMatchGraph


class TriangleViolationChecker:
def __init__(self, semantic_match_graph: SemanticMatchGraph) -> None:
self.semantic_match_graph: SemanticMatchGraph = semantic_match_graph
self.add_log_cost_to_graph()

def add_log_cost_to_graph(self, epsilon: float = 1e-8) -> None:
"""
Adds a log-space 'log_cost' edge attribute to the graph, in place.

Clamps edge weights between (epsilon, 1 - epsilon) to avoid
logarithmic singularities.
"""
edges = tqdm(
self.semantic_match_graph.edges(data=True),
total=self.semantic_match_graph.number_of_edges(),
desc="Transforming to log-space",
)

for u, v, data in edges:
# get original similarity
s = float(data["weight"])
data["original_similarity"] = s

# clamp into (epsilon, 1 - epsilon)
s_clamped = min(max(s, epsilon), 1 - epsilon)

# avoid recomputing max() inside log, since it's already clamped
data["log_cost"] = -math.log(s_clamped)

def calculate_repaired_costs_floyd_warshall(self) -> None:
"""
Repair costs using all-pairs shortest paths (Floyd–Warshall).
Sets log_cost_repaired = min(direct_cost, shortest_path_cost) in the graph.

Note: Floyd–Warshall is simple and cubic in node count. Use it when the graph is small-ish or fairly dense.
"""
print("Calculating distances using Floyd-Warshall. This may take a while (O(n^3)).")
distances = dict(nx.floyd_warshall(self.semantic_match_graph, weight="log_cost"))

# Apply repairs with a progress bar over edges
edges = tqdm(
self.semantic_match_graph.edges(data=True),
total=self.semantic_match_graph.number_of_edges(),
desc="Applying Floyd-Warshall repaired costs",
)
for u, v, data in edges:
direct = float(data["log_cost"])
shortest = distances[u][v] # float('inf') if disconnected
repaired = min(direct, shortest)
data["log_cost_repaired"] = repaired
            data["violation"] = repaired < direct

def calculate_repaired_costs_dijkstra(self) -> None:
"""
Repair costs using repeated single-source Dijkstra.
Sets log_cost_repaired = min(direct_cost, shortest_path_cost) in the graph.

        Note: Better suited for large, sparse graphs. Roughly O((n + m) log n) per source.
"""
nodes = self.semantic_match_graph.nodes

for u in tqdm(nodes, total=len(nodes), desc="Running Dijkstra per node"):
dists = nx.single_source_dijkstra_path_length(self.semantic_match_graph, source=u, weight="log_cost")
for _, v, data in self.semantic_match_graph.out_edges(u, data=True):
direct = float(data["log_cost"])
shortest = dists.get(v, float("inf")) # inf if unreachable
repaired = min(direct, shortest)
data["log_cost_repaired"] = repaired
                data["violation"] = repaired < direct

def transform_back_to_similarity(self) -> None:
"""
Transforms repaired log-costs back into similarity scores.

For each edge:
repaired_similarity = exp(-log_cost_repaired)
weight = repaired_similarity
"""
edges = tqdm(
self.semantic_match_graph.edges(data=True),
total=self.semantic_match_graph.number_of_edges(),
desc="Transforming back to similarity space",
)

for _, _, data in edges:
# Retrieve the repaired log-cost (if absent, skip)
if "log_cost_repaired" not in data:
continue

repaired_cost = float(data["log_cost_repaired"])
repaired_similarity = math.exp(-repaired_cost)

# Store for reference and update active weight
data["repaired_similarity"] = repaired_similarity
data["weight"] = repaired_similarity
Empty file added smr_alignment/tests/__init__.py
Empty file.