Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed

- **agent-write** — `append_child_to_node` normalizes source files missing a trailing newline before line splice, preventing new bullets from being appended onto the last line ([#72](https://github.com/MarcoPorcellato/logseq-matryca-parser/issues/72)).
- **SYNAPSE** — cyclic `{{embed [[Page]]}}` chains no longer duplicate parent literal text; page embed expansion tracks an immutable host-page chain seeded from `to_context_enriched_chunks` ([#65](https://github.com/MarcoPorcellato/logseq-matryca-parser/issues/65)).

### Added

Expand Down
2 changes: 1 addition & 1 deletion docs/ARCHITECTURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ Beyond flat `Document` emission, **`to_context_enriched_chunks`** targets **vect

1. **Breadcrumbs.** [`_build_breadcrumbs`](../src/logseq_matryca_parser/synapse.py) walks the owning `LogseqPage` and the node’s UUID `path` so the chunk’s visible text carries **human-readable lineage** (page title + ancestor outline), not just an opaque `parent_id`.

2. **Recursive macro / embed expansion.** [`_expand_macros_and_embeds`](../src/logseq_matryca_parser/synapse.py) operates on **`node.content`** (not `clean_text`) so tokens hidden from embeddings—such as `((uuid))` inside `{{embed ((uuid))}}`—remain visible to the scanner. It expands **`{{embed ((uuid))}}`** by inlining the target block’s content (with **per-UUID cycle detection**) and **`{{embed [[Page]]}}`** by inlining page bodies via **`LogseqGraph.get_page`** (case-insensitive, with **per-title cycle detection**). Unresolved embed targets yield **empty replacement** instead of hanging the export pipeline (v1.4.0).
2. **Recursive macro / embed expansion.** [`_expand_macros_and_embeds`](../src/logseq_matryca_parser/synapse.py) operates on **`node.content`** (not `clean_text`) so tokens hidden from embeddings—such as `((uuid))` inside `{{embed ((uuid))}}`—remain visible to the scanner. It expands **`{{embed ((uuid))}}`** by inlining the target block’s content (with **per-UUID cycle detection**) and **`{{embed [[Page]]}}`** by inlining page bodies via **`LogseqGraph.get_page`** (case-insensitive, with **per-title embed-chain detection** — the host page title seeds an immutable chain so inter-page cycles truncate at the re-entrant edge instead of re-inlining parent literals). Unresolved embed targets yield **empty replacement** instead of hanging the export pipeline (v1.4.0).

3. **Org-mode-style property inheritance.** Metadata includes **`effective_properties`**: the merge produced by [`LogseqGraph.get_effective_properties`](../src/logseq_matryca_parser/graph.py) — **page frontmatter first**, then each ancestor on `node.path` **top-down**, with deeper `LogseqNode.properties` **overriding** shallower keys. Downstream filters can therefore key off inherited `type::`, `status::`, etc., without re-walking the outline at query time.

Expand Down
51 changes: 30 additions & 21 deletions src/logseq_matryca_parser/synapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,22 +145,28 @@ def _strip_markdown_for_embedding(text: str) -> str:
return s


def _expand_macros_and_embeds(text: str, graph: LogseqGraph, visited_uuids: set[str]) -> str:
def _expand_macros_and_embeds(
text: str,
graph: LogseqGraph,
visited_uuids: set[str],
*,
embed_page_chain: frozenset[str] = frozenset(),
) -> str:
"""Expand ``{{embed ((uuid))}}`` / ``{{embed [[page]]}}`` for RAG text.

Operates on raw block ``content`` (not ``clean_text``) so ``((uuid))`` inside macros is
still visible to the scanner after parsing.
"""
return _expand_macros_and_embeds_impl(text, graph, visited_uuids, set())
return _expand_macros_and_embeds_impl(text, graph, visited_uuids, embed_page_chain)


def _expand_macros_and_embeds_impl(
text: str,
graph: LogseqGraph,
visited_uuids: set[str],
visited_pages: set[str],
embed_page_chain: frozenset[str],
) -> str:
"""Shared worker: ``visited_uuids`` breaks block cycles; ``visited_pages`` breaks page cycles."""
"""Shared worker: ``visited_uuids`` breaks block cycles; ``embed_page_chain`` breaks page cycles."""
result = text
while True:
bm = _BLOCK_EMBED_PATTERN.search(result)
Expand All @@ -185,7 +191,7 @@ def _expand_macros_and_embeds_impl(
next_seen = set(visited_uuids)
next_seen.add(uid)
replacement = _expand_macros_and_embeds_impl(
target.content, graph, next_seen, visited_pages
target.content, graph, next_seen, embed_page_chain
)
result = result[: match.start()] + replacement + result[match.end() :]
else:
Expand All @@ -195,27 +201,24 @@ def _expand_macros_and_embeds_impl(
title = match.group("title").strip()
page = graph.get_page(title)
canonical_title = page.title if page is not None else title
if canonical_title in visited_pages:
if canonical_title in embed_page_chain:
logger.debug("Stack-Machine embed: cyclic page title=%s", canonical_title)
replacement = ""
elif page is None:
logger.debug("Stack-Machine embed: unknown page title=%s", title)
replacement = ""
else:
visited_pages.add(canonical_title)
try:
shared_blocks = set(visited_uuids)
pieces: list[str] = []
for n in _flatten_nodes_for_export(page.root_nodes):
frag = _expand_macros_and_embeds_impl(
n.content, graph, shared_blocks, visited_pages
)
stripped = frag.strip()
if stripped:
pieces.append(stripped)
replacement = "\n".join(pieces)
finally:
visited_pages.discard(canonical_title)
next_chain = embed_page_chain | frozenset({canonical_title})
shared_blocks = set(visited_uuids)
pieces: list[str] = []
for n in _flatten_nodes_for_export(page.root_nodes):
frag = _expand_macros_and_embeds_impl(
n.content, graph, shared_blocks, next_chain
)
stripped = frag.strip()
if stripped:
pieces.append(stripped)
replacement = "\n".join(pieces)
result = result[: match.start()] + replacement + result[match.end() :]
return result

Expand Down Expand Up @@ -404,7 +407,13 @@ def to_context_enriched_chunks(
continue
breadcrumbs, page = _build_breadcrumbs(graph, node)
source_name = Path(node.source_path).name if node.source_path else str(graph.graph_path.name)
expanded_content = _expand_macros_and_embeds(node.content, graph, set())
host_page = graph.page_for_node(node)
embed_chain = (
frozenset({host_page.title}) if host_page is not None else frozenset()
)
expanded_content = _expand_macros_and_embeds(
node.content, graph, set(), embed_page_chain=embed_chain
)
page_content = format_template.format(
breadcrumbs=breadcrumbs,
content=expanded_content,
Expand Down
19 changes: 19 additions & 0 deletions tests/test_synapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,25 @@ def test_expand_page_embed_resolves_case_insensitive_title(tmp_path: Path) -> No
assert "shared body" in expanded


def test_expand_cyclic_page_embed_does_not_duplicate_parent_text(tmp_path: Path) -> None:
    """Regression for #65: an A -> B -> A page-embed cycle keeps A's literals single.

    Page ``A`` embeds ``B`` and ``B`` embeds ``A`` back. The host page title is
    seeded into ``embed_page_chain``, so the re-entrant embed of ``A`` inside ``B``
    must expand to nothing, leaving only ``A``'s own text around ``B``'s body.
    """
    from logseq_matryca_parser.synapse import _expand_macros_and_embeds

    vault_dir = tmp_path / "vault"
    pages_dir = vault_dir / "pages"
    pages_dir.mkdir(parents=True)

    # Two single-block pages that embed each other, forming the cycle under test.
    page_sources = {
        "A.md": "- before {{embed [[B]]}} after\n",
        "B.md": "- inner {{embed [[A]]}}\n",
    }
    for file_name, body in page_sources.items():
        (pages_dir / file_name).write_text(body, encoding="utf-8")

    graph = LogseqGraph.load_directory(vault_dir)
    page_a = graph.pages["A"]
    first_block_text = page_a.root_nodes[0].content
    # Seed the chain with the host title, as the test simulates expansion from page A.
    seeded_chain = frozenset({page_a.title})

    expanded = _expand_macros_and_embeds(
        first_block_text,
        graph,
        set(),
        embed_page_chain=seeded_chain,
    )

    assert expanded.strip() == "before inner after"


def test_expand_missing_block_embed_completes_without_hang(tmp_path: Path) -> None:
from logseq_matryca_parser.synapse import _expand_macros_and_embeds

Expand Down