From 1a68fa562d91f996a6082dd1b6139ac87fa6712c Mon Sep 17 00:00:00 2001 From: Marco Porcellato Date: Mon, 29 Jun 2026 19:18:09 +0200 Subject: [PATCH] fix(synapse): truncate cyclic page embed chains without text duplication Replace visited_pages discard with an immutable embed_page_chain seeded from the host page in to_context_enriched_chunks. Closes #65 Co-authored-by: Cursor --- CHANGELOG.md | 4 +++ docs/ARCHITECTURE.md | 2 +- src/logseq_matryca_parser/synapse.py | 51 ++++++++++++++++------------ tests/test_synapse.py | 19 +++++++++++ 4 files changed, 54 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3899bc8..eb87044 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- **SYNAPSE** — cyclic `{{embed [[Page]]}}` chains no longer duplicate parent literal text; page embed expansion tracks an immutable host-page chain seeded from `to_context_enriched_chunks` ([#65](https://github.com/MarcoPorcellato/logseq-matryca-parser/issues/65)). + ### Added - **Contributor issues (wave 3)** — Six new issues from local code study ([#59](https://github.com/MarcoPorcellato/logseq-matryca-parser/issues/59)–[#64](https://github.com/MarcoPorcellato/logseq-matryca-parser/issues/64)): LENS ghost wikilink nodes, corrupt X-Ray state handling, `agent_write` assert guard, and paired good-first tests. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index b967f04..b33545a 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -270,7 +270,7 @@ Beyond flat `Document` emission, **`to_context_enriched_chunks`** targets **vect 1. **Breadcrumbs.** [`_build_breadcrumbs`](../src/logseq_matryca_parser/synapse.py) walks the owning `LogseqPage` and the node’s UUID `path` so the chunk’s visible text carries **human-readable lineage** (page title + ancestor outline), not just an opaque `parent_id`. -2. **Recursive macro / embed expansion.** [`_expand_macros_and_embeds`](../src/logseq_matryca_parser/synapse.py) operates on **`node.content`** (not `clean_text`) so tokens hidden from embeddings—such as `((uuid))` inside `{{embed ((uuid))}}`—remain visible to the scanner. It expands **`{{embed ((uuid))}}`** by inlining the target block’s content (with **per-UUID cycle detection**) and **`{{embed [[Page]]}}`** by inlining page bodies via **`LogseqGraph.get_page`** (case-insensitive, with **per-title cycle detection**). Unresolved embed targets yield **empty replacement** instead of hanging the export pipeline (v1.4.0). +2. **Recursive macro / embed expansion.** [`_expand_macros_and_embeds`](../src/logseq_matryca_parser/synapse.py) operates on **`node.content`** (not `clean_text`) so tokens hidden from embeddings—such as `((uuid))` inside `{{embed ((uuid))}}`—remain visible to the scanner. It expands **`{{embed ((uuid))}}`** by inlining the target block’s content (with **per-UUID cycle detection**) and **`{{embed [[Page]]}}`** by inlining page bodies via **`LogseqGraph.get_page`** (case-insensitive, with **per-title embed-chain detection** — the host page title seeds an immutable chain so inter-page cycles truncate at the re-entrant edge instead of re-inlining parent literals). Unresolved embed targets yield **empty replacement** instead of hanging the export pipeline (v1.4.0). 3. **Org-mode-style property inheritance.** Metadata includes **`effective_properties`**: the merge produced by [`LogseqGraph.get_effective_properties`](../src/logseq_matryca_parser/graph.py) — **page frontmatter first**, then each ancestor on `node.path` **top-down**, with deeper `LogseqNode.properties` **overriding** shallower keys. Downstream filters can therefore key off inherited `type::`, `status::`, etc., without re-walking the outline at query time. diff --git a/src/logseq_matryca_parser/synapse.py b/src/logseq_matryca_parser/synapse.py index e690ab3..545792e 100644 --- a/src/logseq_matryca_parser/synapse.py +++ b/src/logseq_matryca_parser/synapse.py @@ -145,22 +145,28 @@ def _strip_markdown_for_embedding(text: str) -> str: return s -def _expand_macros_and_embeds(text: str, graph: LogseqGraph, visited_uuids: set[str]) -> str: +def _expand_macros_and_embeds( + text: str, + graph: LogseqGraph, + visited_uuids: set[str], + *, + embed_page_chain: frozenset[str] = frozenset(), +) -> str: """Expand ``{{embed ((uuid))}}`` / ``{{embed [[page]]}}`` for RAG text. Operates on raw block ``content`` (not ``clean_text``) so ``((uuid))`` inside macros is still visible to the scanner after parsing. """ - return _expand_macros_and_embeds_impl(text, graph, visited_uuids, set()) + return _expand_macros_and_embeds_impl(text, graph, visited_uuids, embed_page_chain) def _expand_macros_and_embeds_impl( text: str, graph: LogseqGraph, visited_uuids: set[str], - visited_pages: set[str], + embed_page_chain: frozenset[str], ) -> str: - """Shared worker: ``visited_uuids`` breaks block cycles; ``visited_pages`` breaks page cycles.""" + """Shared worker: ``visited_uuids`` breaks block cycles; ``embed_page_chain`` breaks page cycles.""" result = text while True: bm = _BLOCK_EMBED_PATTERN.search(result) @@ -185,7 +191,7 @@ def _expand_macros_and_embeds_impl( next_seen = set(visited_uuids) next_seen.add(uid) replacement = _expand_macros_and_embeds_impl( - target.content, graph, next_seen, visited_pages + target.content, graph, next_seen, embed_page_chain ) result = result[: match.start()] + replacement + result[match.end() :] else: @@ -195,27 +201,24 @@ def _expand_macros_and_embeds_impl( title = match.group("title").strip() page = graph.get_page(title) canonical_title = page.title if page is not None else title - if canonical_title in visited_pages: + if canonical_title in embed_page_chain: logger.debug("Stack-Machine embed: cyclic page title=%s", canonical_title) replacement = "" elif page is None: logger.debug("Stack-Machine embed: unknown page title=%s", title) replacement = "" else: - visited_pages.add(canonical_title) - try: - shared_blocks = set(visited_uuids) - pieces: list[str] = [] - for n in _flatten_nodes_for_export(page.root_nodes): - frag = _expand_macros_and_embeds_impl( - n.content, graph, shared_blocks, visited_pages - ) - stripped = frag.strip() - if stripped: - pieces.append(stripped) - replacement = "\n".join(pieces) - finally: - visited_pages.discard(canonical_title) + next_chain = embed_page_chain | frozenset({canonical_title}) + shared_blocks = set(visited_uuids) + pieces: list[str] = [] + for n in _flatten_nodes_for_export(page.root_nodes): + frag = _expand_macros_and_embeds_impl( + n.content, graph, shared_blocks, next_chain + ) + stripped = frag.strip() + if stripped: + pieces.append(stripped) + replacement = "\n".join(pieces) result = result[: match.start()] + replacement + result[match.end() :] return result @@ -404,7 +407,13 @@ def to_context_enriched_chunks( continue breadcrumbs, page = _build_breadcrumbs(graph, node) source_name = Path(node.source_path).name if node.source_path else str(graph.graph_path.name) - expanded_content = _expand_macros_and_embeds(node.content, graph, set()) + host_page = graph.page_for_node(node) + embed_chain = ( + frozenset({host_page.title}) if host_page is not None else frozenset() + ) + expanded_content = _expand_macros_and_embeds( + node.content, graph, set(), embed_page_chain=embed_chain + ) page_content = format_template.format( breadcrumbs=breadcrumbs, content=expanded_content, diff --git a/tests/test_synapse.py b/tests/test_synapse.py index bdb012c..d54f83b 100644 --- a/tests/test_synapse.py +++ b/tests/test_synapse.py @@ -335,6 +335,25 @@ def test_expand_page_embed_resolves_case_insensitive_title(tmp_path: Path) -> No assert "shared body" in expanded +def test_expand_cyclic_page_embed_does_not_duplicate_parent_text(tmp_path: Path) -> None: + """A embeds B embeds A must not re-inline parent literal text (#65).""" + from logseq_matryca_parser.synapse import _expand_macros_and_embeds + + graph_root = tmp_path / "vault" + pages = graph_root / "pages" + pages.mkdir(parents=True) + (pages / "A.md").write_text("- before {{embed [[B]]}} after\n", encoding="utf-8") + (pages / "B.md").write_text("- inner {{embed [[A]]}}\n", encoding="utf-8") + graph = LogseqGraph.load_directory(graph_root) + host_page = graph.pages["A"] + text = host_page.root_nodes[0].content + chain = frozenset({host_page.title}) + + expanded = _expand_macros_and_embeds(text, graph, set(), embed_page_chain=chain) + + assert expanded.strip() == "before inner after" + + def test_expand_missing_block_embed_completes_without_hang(tmp_path: Path) -> None: from logseq_matryca_parser.synapse import _expand_macros_and_embeds