From 9aa430a88d7216c70d1c991d5365c85d89b742ba Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 16:59:30 +0800 Subject: [PATCH 01/16] feat(config): add ScopeConfig and DefaultScopeConfig dataclasses --- src/memsearch/config.py | 20 ++++++++++++++++++++ tests/test_config.py | 19 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/src/memsearch/config.py b/src/memsearch/config.py index 52c9b8c1..e394cb6e 100644 --- a/src/memsearch/config.py +++ b/src/memsearch/config.py @@ -70,6 +70,26 @@ class RerankerConfig: model: str = "" # empty = disabled; set to model ID to enable +@dataclass +class ScopeConfig: + """One additional memory scope. See [[scopes]] in TOML.""" + + name: str = "" + collection: str = "" + paths: list[str] = field(default_factory=list) + quota: int | None = None + uri: str = "" # empty = inherit [milvus].uri + token: str = "" # empty = inherit [milvus].token + + +@dataclass +class DefaultScopeConfig: + """Tunable settings for the default (single-collection) scope.""" + + name: str = "project" + quota: int | None = None + + @dataclass class LLMConfig: """LLM settings for plugin summarization and compact. diff --git a/tests/test_config.py b/tests/test_config.py index 4a8bcdf8..a47fc6d3 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -364,3 +364,22 @@ def test_dict_to_config_accepts_empty_section_dicts() -> None: assert cfg.embedding.provider == "openai" assert cfg.milvus.collection == "memsearch_chunks" assert cfg.watch.debounce_ms == 1500 + + +def test_scope_config_defaults(): + """ScopeConfig should have sensible defaults.""" + from memsearch.config import ScopeConfig + sc = ScopeConfig(name="x", collection="c") + assert sc.name == "x" + assert sc.collection == "c" + assert sc.paths == [] + assert sc.quota is None + assert sc.uri == "" + assert sc.token == "" + + +def test_default_scope_config_defaults(): + from memsearch.config import DefaultScopeConfig + ds = DefaultScopeConfig() + assert ds.name == "project" + assert ds.quota is None From 5f750ee5250fc9a1f33b9dfe84026f5c74062a03 Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 17:00:04 +0800 Subject: [PATCH 02/16] style: apply ruff format to config.py and test_config.py --- src/memsearch/config.py | 2 +- tests/test_config.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/memsearch/config.py b/src/memsearch/config.py index e394cb6e..959471c7 100644 --- a/src/memsearch/config.py +++ b/src/memsearch/config.py @@ -78,7 +78,7 @@ class ScopeConfig: collection: str = "" paths: list[str] = field(default_factory=list) quota: int | None = None - uri: str = "" # empty = inherit [milvus].uri + uri: str = "" # empty = inherit [milvus].uri token: str = "" # empty = inherit [milvus].token diff --git a/tests/test_config.py b/tests/test_config.py index a47fc6d3..49b48c38 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -369,6 +369,7 @@ def test_dict_to_config_accepts_empty_section_dicts() -> None: def test_scope_config_defaults(): """ScopeConfig should have sensible defaults.""" from memsearch.config import ScopeConfig + sc = ScopeConfig(name="x", collection="c") assert sc.name == "x" assert sc.collection == "c" @@ -380,6 +381,7 @@ def test_scope_config_defaults(): def test_default_scope_config_defaults(): from memsearch.config import DefaultScopeConfig + ds = DefaultScopeConfig() assert ds.name == "project" assert ds.quota is None From d77e3d254c3afedaec9357bedd26872d99719bf0 Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 17:03:14 +0800 Subject: [PATCH 03/16] docs(config): note T2 will register multi-scope dataclasses Add pointer comments above MemSearchConfig and _SECTION_CLASSES to clarify that ScopeConfig/DefaultScopeConfig are intentionally unwired and will be integrated in Task 2 of the multi-scope plan. Co-Authored-By: Claude Sonnet 4.6 --- src/memsearch/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/memsearch/config.py b/src/memsearch/config.py index 959471c7..be616fe8 100644 --- a/src/memsearch/config.py +++ b/src/memsearch/config.py @@ -116,6 +116,7 @@ class PromptsConfig: summarize: str = "" # custom prompt file for plugin session summarization +# NOTE: ScopeConfig/DefaultScopeConfig are wired in by Task 2 (multi-scope plan). @dataclass class MemSearchConfig: milvus: MilvusConfig = field(default_factory=MilvusConfig) @@ -128,6 +129,7 @@ class MemSearchConfig: prompts: PromptsConfig = field(default_factory=PromptsConfig) +# NOTE: ScopeConfig/DefaultScopeConfig are wired in by Task 2 (multi-scope plan). # -- Section name → dataclass mapping for typed reconstruction -- _SECTION_CLASSES: dict[str, type] = { "milvus": MilvusConfig, From dc0344982ca8c58f36eec44f3968b2efba479ca6 Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 17:05:44 +0800 Subject: [PATCH 04/16] feat(config): wire scopes and default_scope into MemSearchConfig --- src/memsearch/config.py | 25 ++++++++++++++++++++++--- tests/test_config.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/src/memsearch/config.py b/src/memsearch/config.py index be616fe8..91d174b2 100644 --- a/src/memsearch/config.py +++ b/src/memsearch/config.py @@ -116,7 +116,6 @@ class PromptsConfig: summarize: str = "" # custom prompt file for plugin session summarization -# NOTE: ScopeConfig/DefaultScopeConfig are wired in by Task 2 (multi-scope plan). @dataclass class MemSearchConfig: milvus: MilvusConfig = field(default_factory=MilvusConfig) @@ -127,9 +126,10 @@ class MemSearchConfig: reranker: RerankerConfig = field(default_factory=RerankerConfig) llm: LLMConfig = field(default_factory=LLMConfig) prompts: PromptsConfig = field(default_factory=PromptsConfig) + default_scope: DefaultScopeConfig = field(default_factory=DefaultScopeConfig) + scopes: list[ScopeConfig] = field(default_factory=list) -# NOTE: ScopeConfig/DefaultScopeConfig are wired in by Task 2 (multi-scope plan). # -- Section name → dataclass mapping for typed reconstruction -- _SECTION_CLASSES: dict[str, type] = { "milvus": MilvusConfig, @@ -140,6 +140,7 @@ class MemSearchConfig: "reranker": RerankerConfig, "llm": LLMConfig, "prompts": PromptsConfig, + "default_scope": DefaultScopeConfig, } @@ -174,10 +175,16 @@ def resolve_env_ref(value: str) -> str: def _resolve_env_refs_in_dict(d: dict[str, Any]) -> dict[str, Any]: """Walk a nested config dict and resolve all ``env:`` references.""" - resolved = {} + resolved: dict[str, Any] = {} for key, val in d.items(): if isinstance(val, dict): resolved[key] = _resolve_env_refs_in_dict(val) + elif isinstance(val, list): + resolved[key] = [ + _resolve_env_refs_in_dict(item) if isinstance(item, dict) + else (resolve_env_ref(item) if isinstance(item, str) and item.startswith(_ENV_PREFIX) else item) + for item in val + ] elif isinstance(val, str) and val.startswith(_ENV_PREFIX): resolved[key] = resolve_env_ref(val) else: @@ -226,6 +233,18 @@ def _dict_to_config(d: dict[str, Any]) -> MemSearchConfig: valid = {f.name for f in fields(cls)} filtered = {k: v for k, v in section_data.items() if k in valid} kwargs[section_name] = cls(**filtered) + + scopes_raw = d.get("scopes", []) + scopes: list[ScopeConfig] = [] + if isinstance(scopes_raw, list): + valid_scope_keys = {f.name for f in fields(ScopeConfig)} + for entry in scopes_raw: + if not isinstance(entry, dict): + continue + filtered = {k: v for k, v in entry.items() if k in valid_scope_keys} + scopes.append(ScopeConfig(**filtered)) + kwargs["scopes"] = scopes + return MemSearchConfig(**kwargs) diff --git a/tests/test_config.py b/tests/test_config.py index 49b48c38..fef01d73 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -385,3 +385,34 @@ def test_default_scope_config_defaults(): ds = DefaultScopeConfig() assert ds.name == "project" assert ds.quota is None + + +def test_memsearch_config_has_scopes_and_default_scope(): + cfg = MemSearchConfig() + assert cfg.scopes == [] + assert cfg.default_scope.name == "project" + + +def test_resolve_config_loads_scopes_array(tmp_path, monkeypatch): + """[[scopes]] array-of-tables should round-trip into MemSearchConfig.scopes.""" + import memsearch.config as cfg_mod + proj = tmp_path / ".memsearch.toml" + proj.write_text( + '[default_scope]\nname = "myproj"\nquota = 5\n\n' + '[[scopes]]\nname = "global"\ncollection = "ms_global"\n' + 'paths = ["/tmp/g"]\nquota = 3\n\n' + '[[scopes]]\nname = "personal"\ncollection = "ms_personal"\n' + 'paths = ["/tmp/p"]\n' + ) + monkeypatch.setattr(cfg_mod, "PROJECT_CONFIG_PATH", proj) + monkeypatch.setattr(cfg_mod, "GLOBAL_CONFIG_PATH", tmp_path / "global-absent.toml") + cfg = cfg_mod.resolve_config() + assert cfg.default_scope.name == "myproj" + assert cfg.default_scope.quota == 5 + assert len(cfg.scopes) == 2 + assert cfg.scopes[0].name == "global" + assert cfg.scopes[0].collection == "ms_global" + assert cfg.scopes[0].paths == ["/tmp/g"] + assert cfg.scopes[0].quota == 3 + assert cfg.scopes[1].name == "personal" + assert cfg.scopes[1].quota is None From 32409b83c0e186ea71c01d5a0e713f9cbc5debf9 Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 17:06:19 +0800 Subject: [PATCH 05/16] style(config): apply ruff format to Task 2 changes --- src/memsearch/config.py | 3 ++- tests/test_config.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/memsearch/config.py b/src/memsearch/config.py index 91d174b2..fb7235b5 100644 --- a/src/memsearch/config.py +++ b/src/memsearch/config.py @@ -181,7 +181,8 @@ def _resolve_env_refs_in_dict(d: dict[str, Any]) -> dict[str, Any]: resolved[key] = _resolve_env_refs_in_dict(val) elif isinstance(val, list): resolved[key] = [ - _resolve_env_refs_in_dict(item) if isinstance(item, dict) + _resolve_env_refs_in_dict(item) + if isinstance(item, dict) else (resolve_env_ref(item) if isinstance(item, str) and item.startswith(_ENV_PREFIX) else item) for item in val ] diff --git a/tests/test_config.py b/tests/test_config.py index fef01d73..7e953138 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -396,6 +396,7 @@ def test_memsearch_config_has_scopes_and_default_scope(): def test_resolve_config_loads_scopes_array(tmp_path, monkeypatch): """[[scopes]] array-of-tables should round-trip into MemSearchConfig.scopes.""" import memsearch.config as cfg_mod + proj = tmp_path / ".memsearch.toml" proj.write_text( '[default_scope]\nname = "myproj"\nquota = 5\n\n' From 6876b35b5dd597e1ade857e237b314b4c4b0d7cc Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 17:13:47 +0800 Subject: [PATCH 06/16] feat(config): validate scope paths don't overlap --- src/memsearch/config.py | 35 +++++++++++++++++++++++++++++++++++ tests/test_config.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/src/memsearch/config.py b/src/memsearch/config.py index fb7235b5..370c48ff 100644 --- a/src/memsearch/config.py +++ b/src/memsearch/config.py @@ -156,6 +156,39 @@ class ConfigEnvVarError(KeyError): """ +class ScopePathOverlapError(ValueError): + """Raised when two scopes have overlapping paths.""" + + +def validate_scope_paths(scopes: list[ScopeConfig]) -> None: + """Raise ScopePathOverlapError if any two scopes' paths overlap. + + Path A overlaps path B if A is a parent of B (or vice versa). Read-only + scopes (empty paths) cannot conflict. + """ + resolved: list[tuple[str, Path]] = [] + for sc in scopes: + resolved.extend((sc.name, Path(p).expanduser().resolve()) for p in sc.paths) + for i, (name_a, path_a) in enumerate(resolved): + for name_b, path_b in resolved[i + 1 :]: + if name_a == name_b: + continue + if _is_parent_or_equal(path_a, path_b) or _is_parent_or_equal(path_b, path_a): + raise ScopePathOverlapError( + f"Scope paths overlap: scope {name_a!r} path {path_a} conflicts with scope {name_b!r} path {path_b}" + ) + + +def _is_parent_or_equal(parent: Path, child: Path) -> bool: + if parent == child: + return True + try: + child.relative_to(parent) + return True + except ValueError: + return False + + def resolve_env_ref(value: str) -> str: """Resolve an ``env:VAR_NAME`` reference to its environment variable value. @@ -288,6 +321,8 @@ def resolve_config(cli_overrides: dict[str, Any] | None = None) -> MemSearchConf cfg.embedding.model = DEFAULT_MODELS.get(cfg.embedding.provider, "") + validate_scope_paths(cfg.scopes) + return cfg diff --git a/tests/test_config.py b/tests/test_config.py index 7e953138..a5596698 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -417,3 +417,38 @@ def test_resolve_config_loads_scopes_array(tmp_path, monkeypatch): assert cfg.scopes[0].quota == 3 assert cfg.scopes[1].name == "personal" assert cfg.scopes[1].quota is None + + +def test_validate_scope_paths_rejects_overlap(tmp_path): + from memsearch.config import ScopeConfig, ScopePathOverlapError, validate_scope_paths + + a = tmp_path / "shared" + a.mkdir() + scopes = [ + ScopeConfig(name="a", collection="ca", paths=[str(a)]), + ScopeConfig(name="b", collection="cb", paths=[str(a / "sub")]), + ] + with pytest.raises(ScopePathOverlapError) as exc: + validate_scope_paths(scopes) + assert "a" in str(exc.value) and "b" in str(exc.value) + + +def test_validate_scope_paths_allows_disjoint(tmp_path): + from memsearch.config import ScopeConfig, validate_scope_paths + + scopes = [ + ScopeConfig(name="a", collection="ca", paths=[str(tmp_path / "a")]), + ScopeConfig(name="b", collection="cb", paths=[str(tmp_path / "b")]), + ] + validate_scope_paths(scopes) # must not raise + + +def test_validate_scope_paths_skips_empty_paths(): + """Read-only scopes (no paths) cannot conflict with anything.""" + from memsearch.config import ScopeConfig, validate_scope_paths + + scopes = [ + ScopeConfig(name="a", collection="ca", paths=["/tmp/foo"]), + ScopeConfig(name="b", collection="cb", paths=[]), # read-only + ] + validate_scope_paths(scopes) From 2671b02f0b522807471ae9e5f6247f93d62422b5 Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 17:18:15 +0800 Subject: [PATCH 07/16] feat(core): add Scope dataclass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add frozen `Scope` dataclass with name, collection, paths, quota, uri, and token fields — first building block for multi-scope blended search. --- src/memsearch/core.py | 18 ++++++++++++++++++ tests/test_core_unit.py | 15 +++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 tests/test_core_unit.py diff --git a/src/memsearch/core.py b/src/memsearch/core.py index b3eabb3b..447042d2 100644 --- a/src/memsearch/core.py +++ b/src/memsearch/core.py @@ -5,6 +5,7 @@ import asyncio import logging from collections.abc import Callable +from dataclasses import dataclass, field from datetime import date from pathlib import Path from typing import TYPE_CHECKING, Any @@ -21,6 +22,23 @@ logger = logging.getLogger(__name__) +@dataclass(frozen=True) +class Scope: + """One memory scope. See spec for full semantics. + + A scope with empty ``paths`` is read-only (search-only, never indexed). + ``quota=None`` means "share remaining slots with other unquota'd scopes". + ``uri``/``token`` of ``None`` means inherit from the parent ``MemSearch``. + """ + + name: str + collection: str + paths: list[str] = field(default_factory=list) + quota: int | None = None + uri: str | None = None + token: str | None = None + + class MemSearch: """High-level API for semantic memory search. diff --git a/tests/test_core_unit.py b/tests/test_core_unit.py new file mode 100644 index 00000000..5c0a34bf --- /dev/null +++ b/tests/test_core_unit.py @@ -0,0 +1,15 @@ +"""Unit tests for core helpers that don't require Milvus or an embedder.""" + +from __future__ import annotations + + +def test_scope_dataclass_defaults(): + from memsearch.core import Scope + + s = Scope(name="x", collection="c") + assert s.name == "x" + assert s.collection == "c" + assert s.paths == [] + assert s.quota is None + assert s.uri is None + assert s.token is None From 1edea02e7515bdbedbd7b3adf33b4013599028eb Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 17:21:45 +0800 Subject: [PATCH 08/16] feat(core): pure dedup+quota helper for multi-scope blending Co-Authored-By: Claude Sonnet 4.6 --- src/memsearch/core.py | 68 ++++++++++++++++++++++++ tests/test_core_unit.py | 113 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 181 insertions(+) diff --git a/src/memsearch/core.py b/src/memsearch/core.py index 447042d2..6f2067ad 100644 --- a/src/memsearch/core.py +++ b/src/memsearch/core.py @@ -39,6 +39,74 @@ class Scope: token: str | None = None +def _blend_scope_results( + per_scope: list[tuple[str, list[dict]]], + scope_quotas: dict[str, int | None], + default_scope_name: str, + scope_order: list[str], + top_k: int, +) -> list[dict]: + """Dedup, apply per-scope quotas, return top-K blended. + + Algorithm: + 1. Tag each hit with its scope name. + 2. Dedup by chunk_hash; keep highest-scoring; remember winning scope. + 3. Quota modes: + - all scopes have quotas → hard cap per scope, no redistribution + - no scopes have quotas → return globally top-K by score + - mixed → quota'd capped first; unquota'd share remainder by score + 4. Tie-break: default scope wins, then ``scope_order`` index. + """ + # 1+2. Tag & dedup + seen: dict[str, dict] = {} + for scope_name, hits in per_scope: + for h in hits: + key = h["chunk_hash"] + tagged = {**h, "scope": scope_name} + existing = seen.get(key) + if existing is None or tagged["score"] > existing["score"]: + seen[key] = tagged + + scope_rank = {name: i for i, name in enumerate(scope_order)} + + def sort_key(r: dict) -> tuple: + # Higher score first; then default scope wins; then config order + return ( + -r["score"], + 0 if r["scope"] == default_scope_name else 1, + scope_rank.get(r["scope"], len(scope_order)), + ) + + all_hits = sorted(seen.values(), key=sort_key) + + # 3. Quota modes + quotas_present = [v for v in scope_quotas.values() if v is not None] + + # All-no-quota: just top-k + if not quotas_present: + return all_hits[:top_k] + + capped: dict[str, list[dict]] = {n: [] for n in scope_quotas} + leftovers: list[dict] = [] + + for h in all_hits: + sc = h["scope"] + q = scope_quotas.get(sc) + if q is None: + leftovers.append(h) + elif len(capped[sc]) < q: + capped[sc].append(h) + # else: quota'd scope full; drop this hit (no redistribution) + + quota_total = sum(scope_quotas[n] or 0 for n in scope_quotas) + remaining_slots = max(0, top_k - quota_total) + chosen_leftovers = leftovers[:remaining_slots] + + merged = [h for hits in capped.values() for h in hits] + chosen_leftovers + merged.sort(key=sort_key) + return merged[:top_k] + + class MemSearch: """High-level API for semantic memory search. diff --git a/tests/test_core_unit.py b/tests/test_core_unit.py index 5c0a34bf..cb4ddb4e 100644 --- a/tests/test_core_unit.py +++ b/tests/test_core_unit.py @@ -13,3 +13,116 @@ def test_scope_dataclass_defaults(): assert s.quota is None assert s.uri is None assert s.token is None + + +def _hit(chunk_hash: str, score: float, content: str = "x", source: str = "/x.md") -> dict: + return {"chunk_hash": chunk_hash, "score": score, "content": content, "source": source} + + +def test_blend_dedups_keeps_higher_score(): + from memsearch.core import _blend_scope_results + + result = _blend_scope_results( + per_scope=[ + ("project", [_hit("a", 0.5), _hit("b", 0.3)]), + ("global", [_hit("a", 0.9)]), # same chunk_hash, higher score + ], + scope_quotas={"project": None, "global": None}, + default_scope_name="project", + scope_order=["project", "global"], + top_k=10, + ) + chunk_a = next(r for r in result if r["chunk_hash"] == "a") + assert chunk_a["score"] == 0.9 + assert chunk_a["scope"] == "global" + + +def test_blend_all_quotas_caps_per_scope(): + from memsearch.core import _blend_scope_results + + result = _blend_scope_results( + per_scope=[ + ("project", [_hit(f"p{i}", 1.0 - i * 0.01) for i in range(10)]), + ("global", [_hit(f"g{i}", 0.5 - i * 0.01) for i in range(10)]), + ], + scope_quotas={"project": 3, "global": 2}, + default_scope_name="project", + scope_order=["project", "global"], + top_k=10, + ) + by_scope = {} + for r in result: + by_scope.setdefault(r["scope"], 0) + by_scope[r["scope"]] += 1 + assert by_scope == {"project": 3, "global": 2} + + +def test_blend_no_quotas_returns_global_top_k(): + from memsearch.core import _blend_scope_results + + result = _blend_scope_results( + per_scope=[ + ("project", [_hit(f"p{i}", 0.5) for i in range(5)]), + ("global", [_hit(f"g{i}", 0.9) for i in range(5)]), + ], + scope_quotas={"project": None, "global": None}, + default_scope_name="project", + scope_order=["project", "global"], + top_k=4, + ) + assert len(result) == 4 + # global has higher scores; all 4 should be from global + assert all(r["scope"] == "global" for r in result) + + +def test_blend_mixed_quotas(): + """Quota'd scopes filled first (cap), unquota'd share remainder by score.""" + from memsearch.core import _blend_scope_results + + result = _blend_scope_results( + per_scope=[ + ("project", [_hit(f"p{i}", 0.95) for i in range(10)]), # high score, no quota + ("global", [_hit(f"g{i}", 0.50) for i in range(10)]), # quota=2 + ], + scope_quotas={"project": None, "global": 2}, + default_scope_name="project", + scope_order=["project", "global"], + top_k=5, + ) + by_scope = {r["scope"] for r in result} + counts = {s: sum(1 for r in result if r["scope"] == s) for s in by_scope} + assert counts == {"project": 3, "global": 2} + + +def test_blend_quota_underfill_does_not_redistribute(): + from memsearch.core import _blend_scope_results + + result = _blend_scope_results( + per_scope=[ + ("project", [_hit(f"p{i}", 0.9) for i in range(10)]), + ("global", [_hit("g0", 0.5)]), # only 1 hit, quota 5 → 4 empty slots + ], + scope_quotas={"project": 3, "global": 5}, + default_scope_name="project", + scope_order=["project", "global"], + top_k=10, + ) + counts = {s: sum(1 for r in result if r["scope"] == s) for s in {"project", "global"}} + assert counts == {"project": 3, "global": 1} # NOT 3 + 5; project still capped + + +def test_blend_tiebreak_default_scope_wins(): + from memsearch.core import _blend_scope_results + + result = _blend_scope_results( + per_scope=[ + ("project", [_hit("p", 0.5)]), + ("global", [_hit("g", 0.5)]), # equal score + ], + scope_quotas={"project": None, "global": None}, + default_scope_name="project", + scope_order=["project", "global"], + top_k=2, + ) + assert result[0]["scope"] == "project" + assert result[1]["scope"] == "global" From 0a79264398634aff2646383148b267889a9b541f Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 17:30:08 +0800 Subject: [PATCH 09/16] feat(core): construct one MilvusStore per scope Add `default_scope_name`, `default_scope_quota`, and `extra_scopes` kwargs to `MemSearch.__init__`; build `self._stores: dict[str, MilvusStore]` with one entry per scope; keep `self._store` as a back-compat alias pointing at the default scope's store. Update `close()` to iterate all stores, with a `__new__`-safe fallback for test fixtures that bypass `__init__`. Co-Authored-By: Claude Sonnet 4.6 --- src/memsearch/core.py | 45 +++++++++++++++++++++++++++++++++-------- tests/test_core_unit.py | 44 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 8 deletions(-) diff --git a/src/memsearch/core.py b/src/memsearch/core.py index 6f2067ad..bcf3c9f4 100644 --- a/src/memsearch/core.py +++ b/src/memsearch/core.py @@ -146,6 +146,9 @@ def __init__( max_chunk_size: int = 1500, overlap_lines: int = 2, reranker_model: str = "", + default_scope_name: str = "project", + default_scope_quota: int | None = None, + extra_scopes: list[Scope] | None = None, ) -> None: self._paths = [str(p) for p in (paths or [])] self._max_chunk_size = max_chunk_size @@ -157,14 +160,35 @@ def __init__( base_url=embedding_base_url, api_key=embedding_api_key, ) - self._store = MilvusStore( - uri=milvus_uri, - token=milvus_token, - collection=collection, - dimension=self._embedder.dimension, - description=description, - ) self._reranker_model = reranker_model + self._default_scope_name = default_scope_name + self._default_scope_quota = default_scope_quota + self._extra_scopes: list[Scope] = list(extra_scopes or []) + + # Default scope's store (uses parent milvus_uri/token + collection kwarg) + self._stores: dict[str, MilvusStore] = { + default_scope_name: MilvusStore( + uri=milvus_uri, + token=milvus_token, + collection=collection, + dimension=self._embedder.dimension, + description=description, + ) + } + # Back-compat alias for code that still references self._store + self._store = self._stores[default_scope_name] + + # Extra scopes: each gets its own store, optionally on a different Milvus + for sc in self._extra_scopes: + if sc.name in self._stores: + raise ValueError(f"Duplicate scope name: {sc.name!r}") + self._stores[sc.name] = MilvusStore( + uri=sc.uri or milvus_uri, + token=sc.token if sc.token is not None else milvus_token, + collection=sc.collection, + dimension=self._embedder.dimension, + description=description, + ) # ------------------------------------------------------------------ # Indexing @@ -490,7 +514,12 @@ def store(self) -> MilvusStore: def close(self) -> None: """Release resources.""" - self._store.close() + stores = getattr(self, "_stores", None) + if stores is not None: + for store in stores.values(): + store.close() + elif (store := getattr(self, "_store", None)) is not None: + store.close() def __enter__(self) -> MemSearch: return self diff --git a/tests/test_core_unit.py b/tests/test_core_unit.py index cb4ddb4e..a83e2a45 100644 --- a/tests/test_core_unit.py +++ b/tests/test_core_unit.py @@ -126,3 +126,47 @@ def test_blend_tiebreak_default_scope_wins(): ) assert result[0]["scope"] == "project" assert result[1]["scope"] == "global" + + +def test_memsearch_default_only_one_store(tmp_path): + """No extra_scopes → exactly one store, named after default_scope_name.""" + from memsearch.core import MemSearch + + m = MemSearch(milvus_uri=str(tmp_path / "x.db"), embedding_provider="openai", embedding_api_key="fake") + try: + assert list(m._stores.keys()) == ["project"] + finally: + m.close() + + +def test_memsearch_extra_scopes_create_per_scope_stores(tmp_path): + from memsearch.core import MemSearch, Scope + + m = MemSearch( + milvus_uri=str(tmp_path / "x.db"), + embedding_provider="openai", + embedding_api_key="fake", + extra_scopes=[ + Scope(name="global", collection="ms_global"), + Scope(name="personal", collection="ms_personal"), + ], + ) + try: + assert set(m._stores.keys()) == {"project", "global", "personal"} + finally: + m.close() + + +def test_memsearch_default_scope_name_override(tmp_path): + from memsearch.core import MemSearch + + m = MemSearch( + milvus_uri=str(tmp_path / "x.db"), + embedding_provider="openai", + embedding_api_key="fake", + default_scope_name="myproj", + ) + try: + assert list(m._stores.keys()) == ["myproj"] + finally: + m.close() From 4f7488c47a8dfd1db5db693a9520e42f840795cd Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 17:36:14 +0800 Subject: [PATCH 10/16] feat(core): multi-scope blended search with fan-out, dedup, quotas Replace MemSearch.search body with single-scope fast path (no scope tag, backwards-compatible) and multi-scope path using asyncio.gather fan-out, _blend_scope_results dedup+quota logic, and only_scope restriction with ValueError on unknown names. Add _seed_scope helper, two_scope_mem fixture, and four integration tests covering: no-scope-field on single-scope, scope tagging on multi-scope, only_scope restriction, and ValueError on unknown scope names. Co-Authored-By: Claude Sonnet 4.6 --- src/memsearch/core.py | 81 ++++++++++++++++++++++++++++++++++----- tests/test_core.py | 89 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 9 deletions(-) diff --git a/src/memsearch/core.py b/src/memsearch/core.py index bcf3c9f4..461db23d 100644 --- a/src/memsearch/core.py +++ b/src/memsearch/core.py @@ -313,8 +313,9 @@ async def search( *, top_k: int = 10, source_prefix: str | Path | None = None, + only_scope: list[str] | None = None, ) -> list[dict[str, Any]]: - """Semantic search across indexed chunks. + """Semantic search across one or more scopes. Parameters ---------- @@ -325,27 +326,89 @@ async def search( source_prefix: Optional path prefix to scope results. Only chunks whose ``source`` starts with this prefix are returned. + In multi-scope mode this filter applies only to the default scope. + only_scope: + If given, restrict the search to the named scope(s). Raises + ``ValueError`` if any name is not a known scope. Returns ------- list[dict] Each dict contains ``content``, ``source``, ``heading``, - ``score``, and other metadata. + ``score``, and other metadata. In multi-scope mode each result + also carries a ``scope`` field. """ - filter_expr = "" + # Single-scope fast path: no extra_scopes → original behavior, no 'scope' tag + if not self._extra_scopes: + filter_expr = "" + if source_prefix is not None: + prefix = str(Path(source_prefix).expanduser().resolve()) + escaped = prefix.replace("\\", "\\\\").replace('"', '\\"') + filter_expr = f'source like "{escaped}%"' + embeddings = await self._embedder.embed([query]) + fetch_k = top_k * 3 if self._reranker_model else top_k + results = self._store.search( + embeddings[0], + query_text=query, + top_k=fetch_k, + filter_expr=filter_expr, + ) + if self._reranker_model and results: + from .reranker import rerank + + results = rerank(query, results, model_name=self._reranker_model, top_k=top_k) + return results + + # Multi-scope path + all_scope_names = list(self._stores.keys()) + if only_scope is not None: + unknown = set(only_scope) - set(all_scope_names) + if unknown: + raise ValueError(f"unknown scope(s) in only_scope: {sorted(unknown)}") + active = [n for n in all_scope_names if n in set(only_scope)] + else: + active = all_scope_names + + # Source-prefix filter only applies to the default scope + default_filter = "" if source_prefix is not None: prefix = str(Path(source_prefix).expanduser().resolve()) escaped = prefix.replace("\\", "\\\\").replace('"', '\\"') - filter_expr = f'source like "{escaped}%"' + default_filter = f'source like "{escaped}%"' embeddings = await self._embedder.embed([query]) - fetch_k = top_k * 3 if self._reranker_model else top_k - results = self._store.search(embeddings[0], query_text=query, top_k=fetch_k, filter_expr=filter_expr) - if self._reranker_model and results: + fetch_k_per = max(top_k * 2, 10) # over-fetch for dedup margin + + async def _fetch(scope_name: str) -> tuple[str, list[dict]]: + store = self._stores[scope_name] + filt = default_filter if scope_name == self._default_scope_name else "" + hits = store.search(embeddings[0], query_text=query, top_k=fetch_k_per, filter_expr=filt) + return scope_name, hits + + per_scope = await asyncio.gather(*[_fetch(n) for n in active]) + + # Build quota map + scope_quotas: dict[str, int | None] = {} + for sc in self._extra_scopes: + if sc.name in active: + scope_quotas[sc.name] = sc.quota + if self._default_scope_name in active: + scope_quotas[self._default_scope_name] = self._default_scope_quota + + scope_order = [self._default_scope_name] + [s.name for s in self._extra_scopes] + merged = _blend_scope_results( + per_scope=list(per_scope), + scope_quotas=scope_quotas, + default_scope_name=self._default_scope_name, + scope_order=scope_order, + top_k=top_k, + ) + + if self._reranker_model and merged: from .reranker import rerank - results = rerank(query, results, model_name=self._reranker_model, top_k=top_k) - return results + merged = rerank(query, merged, model_name=self._reranker_model, top_k=top_k) + return merged # ------------------------------------------------------------------ # Compact (compress memories) diff --git a/tests/test_core.py b/tests/test_core.py index 9db540ba..e26e8ece 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -66,3 +66,92 @@ async def test_index_single_file(mem, sample_dir: Path): results = await mem.search("list comprehension") assert len(results) > 0 + + +# --------------------------------------------------------------------------- +# T7: multi-scope blended search tests +# --------------------------------------------------------------------------- + +from memsearch.chunker import chunk_markdown, compute_chunk_id # noqa: E402 + + +async def _seed_scope(mem, store_name: str, file_path, content: str): + """Write content to file_path and upsert its chunks into mem._stores[store_name].""" + file_path.write_text(content) + text = file_path.read_text() + chunks = chunk_markdown(text, source=str(file_path), max_chunk_size=1500, overlap_lines=2) + if not chunks: + return + embeddings = await mem._embedder.embed([c.content for c in chunks]) + model = mem._embedder.model_name + rows = [ + { + "chunk_hash": compute_chunk_id(c.source, c.start_line, c.end_line, c.content_hash, model), + "content": c.content, + "source": c.source, + "heading": c.heading, + "heading_level": c.heading_level, + "start_line": c.start_line, + "end_line": c.end_line, + "embedding": e, + } + for c, e in zip(chunks, embeddings, strict=True) + ] + mem._stores[store_name].upsert(rows) + + +@pytest.fixture +def two_scope_mem(tmp_path): + from memsearch.core import MemSearch, Scope + + m = MemSearch( + milvus_uri=str(tmp_path / "x.db"), + collection="ms_proj", + extra_scopes=[Scope(name="global", collection="ms_global", quota=2)], + default_scope_quota=2, + ) + yield m + m.close() + + +@pytest.mark.asyncio +async def test_search_single_scope_no_scope_field(mem, sample_dir): + """Single-scope MemSearch must NOT add a 'scope' field to results.""" + mem._paths = [str(sample_dir)] + await mem.index() + results = await mem.search("python", top_k=2) + assert results + assert "scope" not in results[0] + + +@pytest.mark.asyncio +async def test_search_multi_scope_tags_results(two_scope_mem, tmp_path): + """Multi-scope: results carry 'scope' field; both scopes surface.""" + proj_dir = tmp_path / "proj" + glob_dir = tmp_path / "glob" + proj_dir.mkdir() + glob_dir.mkdir() + await _seed_scope(two_scope_mem, "project", proj_dir / "p.md", "# Project\n\nDeploy via uv.\n") + await _seed_scope(two_scope_mem, "global", glob_dir / "g.md", "# Global\n\nUse 4-space indents.\n") + + results = await two_scope_mem.search("how do I deploy", top_k=4) + scopes_seen = {r["scope"] for r in results} + assert "project" in scopes_seen + assert "scope" in results[0] + + +@pytest.mark.asyncio +async def test_search_only_scope_restriction(two_scope_mem, tmp_path): + """only_scope=['project'] must exclude 'global'.""" + await _seed_scope(two_scope_mem, "project", tmp_path / "p.md", "# P\n\nFoo bar baz.\n") + await _seed_scope(two_scope_mem, "global", tmp_path / "g.md", "# G\n\nFoo bar baz.\n") + + results = await two_scope_mem.search("foo", top_k=4, only_scope=["project"]) + assert results + assert all(r["scope"] == "project" for r in results) + + +@pytest.mark.asyncio +async def test_search_only_scope_unknown_raises(two_scope_mem): + with pytest.raises(ValueError, match="unknown scope"): + await two_scope_mem.search("foo", top_k=4, only_scope=["nope"]) From 6149eb3f1db88a73e98be2af69aa653f2b35e964 Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 17:45:52 +0800 Subject: [PATCH 11/16] feat(core): route index() by per-scope paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MemSearch.index() now builds a plan from the default scope's _paths plus any extra_scopes with non-empty paths. Each file is indexed into the per-scope store via _index_file(scope_name=…). Read-only scopes (empty paths) are skipped entirely. _embed_and_store() also accepts an optional scope_name so it writes to the correct store. Backward-compat is preserved: objects constructed via __new__ without _default_scope_name / _stores fall back to the old _store attr; when scope_name is None the helpers use self._store as before. Co-Authored-By: Claude Sonnet 4.6 --- src/memsearch/core.py | 101 ++++++++++++++++++++---------------------- tests/test_core.py | 56 +++++++++++++++++++++++ 2 files changed, 104 insertions(+), 53 deletions(-) diff --git a/src/memsearch/core.py b/src/memsearch/core.py index 461db23d..9b6774d7 100644 --- a/src/memsearch/core.py +++ b/src/memsearch/core.py @@ -195,35 +195,48 @@ def __init__( # ------------------------------------------------------------------ async def index(self, *, force: bool = False) -> int: - """Scan paths and index all markdown files. + """Scan all scopes' paths and index files into the per-scope stores. - Returns the number of chunks indexed. Also removes chunks for - files that no longer exist on disk (deleted-file cleanup). + Default scope uses ``self._paths``. Extra scopes with non-empty + ``paths`` are also indexed into their own stores. Read-only scopes + (empty paths) are skipped. + Returns total chunks indexed across all scopes. """ - files = scan_paths(self._paths) + default_name: str | None = getattr(self, "_default_scope_name", None) + extra_scopes: list[Scope] = getattr(self, "_extra_scopes", []) + stores: dict[str, MilvusStore] = getattr(self, "_stores", {}) + + plan: list[tuple[str | None, list[str]]] = [(default_name, self._paths)] + plan.extend((sc.name, sc.paths) for sc in extra_scopes if sc.paths) + total = 0 - failed = 0 - active_sources: set[str] = set() - for f in files: - active_sources.add(str(f.path)) - try: - n = await self._index_file(f, force=force) - total += n - except Exception: - failed += 1 - logger.exception("Failed to index %s, skipping", f.path) - - # Clean up chunks for files that no longer exist - indexed_sources = self._store.indexed_sources() - for source in indexed_sources: - if source not in active_sources: - self._store.delete_by_source(source) - logger.info("Removed stale chunks for deleted file: %s", source) - - if failed: - logger.warning("Indexed %d chunks from %d files (%d files failed)", total, len(files) - failed, failed) - else: - logger.info("Indexed %d chunks from %d files", total, len(files)) + for scope_name, paths in plan: + if not paths: + continue + files = scan_paths(paths) + failed = 0 + active_sources: set[str] = set() + for f in files: + active_sources.add(str(f.path)) + try: + if scope_name is not None: + n = await self._index_file(f, scope_name=scope_name, force=force) + else: + n = await self._index_file(f, force=force) + total += n + except Exception: + failed += 1 + logger.exception("Failed to index %s, skipping", f.path) + + store = stores.get(scope_name) if scope_name else getattr(self, "_store", None) + if store is not None: + for source in store.indexed_sources(): + if source not in active_sources: + store.delete_by_source(source) + logger.info("[%s] removed stale chunks for deleted file: %s", scope_name, source) + + if failed: + logger.warning("[%s] indexed (%d files failed)", scope_name, failed) return total async def index_file(self, path: str | Path) -> int: @@ -233,7 +246,8 @@ async def index_file(self, path: str | Path) -> int: sf = ScannedFile(path=p, mtime=_st.st_mtime, size=_st.st_size) return await self._index_file(sf) - async def _index_file(self, f: ScannedFile, *, force: bool = False) -> int: + async def _index_file(self, f: ScannedFile, *, scope_name: str | None = None, force: bool = False) -> int: + store = self._stores[scope_name] if scope_name else self._store source = str(f.path) text = f.path.read_text(encoding="utf-8") chunks = chunk_markdown( @@ -243,21 +257,14 @@ async def _index_file(self, f: ScannedFile, *, force: bool = False) -> int: overlap_lines=self._overlap_lines, ) model = self._embedder.model_name - - # Compute composite chunk IDs (matching OpenClaw format) chunk_ids = {compute_chunk_id(c.source, c.start_line, c.end_line, c.content_hash, model) for c in chunks} - old_ids = self._store.hashes_by_source(source) - - # Delete stale chunks that are no longer in the file + old_ids = store.hashes_by_source(source) stale = old_ids - chunk_ids if stale: - self._store.delete_by_hashes(list(stale)) - + store.delete_by_hashes(list(stale)) if not chunks: return 0 - if not force: - # Only embed chunks whose ID doesn't already exist chunks = [ c for c in chunks @@ -265,29 +272,18 @@ async def _index_file(self, f: ScannedFile, *, force: bool = False) -> int: ] if not chunks: return 0 + return await self._embed_and_store(chunks, scope_name=scope_name) - return await self._embed_and_store(chunks) - - async def _embed_and_store(self, chunks: list[Chunk]) -> int: + async def _embed_and_store(self, chunks: list[Chunk], *, scope_name: str | None = None) -> int: if not chunks: return 0 - + store = self._stores[scope_name] if scope_name else self._store model = self._embedder.model_name - # Clean content for embedding: strip HTML comments and metadata noise - # so the embedding vector captures semantics, not UUIDs/paths. - # The original content is preserved in the Milvus record below. contents = [clean_content_for_embedding(c.content) for c in chunks] embeddings = await self._embedder.embed(contents) - records: list[dict[str, Any]] = [] for i, chunk in enumerate(chunks): - chunk_id = compute_chunk_id( - chunk.source, - chunk.start_line, - chunk.end_line, - chunk.content_hash, - model, - ) + chunk_id = compute_chunk_id(chunk.source, chunk.start_line, chunk.end_line, chunk.content_hash, model) records.append( { "chunk_hash": chunk_id, @@ -300,8 +296,7 @@ async def _embed_and_store(self, chunks: list[Chunk]) -> int: "end_line": chunk.end_line, } ) - - return self._store.upsert(records) + return store.upsert(records) # ------------------------------------------------------------------ # Search diff --git a/tests/test_core.py b/tests/test_core.py index e26e8ece..6a63b2f8 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -155,3 +155,59 @@ async def test_search_only_scope_restriction(two_scope_mem, tmp_path): async def test_search_only_scope_unknown_raises(two_scope_mem): with pytest.raises(ValueError, match="unknown scope"): await two_scope_mem.search("foo", top_k=4, only_scope=["nope"]) + + +@pytest.mark.asyncio +async def test_index_routes_files_by_scope_paths(tmp_path): + """Files under scope A's paths land in scope A's store; same for B.""" + from memsearch.core import MemSearch, Scope + + proj_dir = tmp_path / "proj" + glob_dir = tmp_path / "glob" + proj_dir.mkdir() + glob_dir.mkdir() + (proj_dir / "p.md").write_text("# Project\n\nProject-specific note.\n") + (glob_dir / "g.md").write_text("# Global\n\nGlobal preference note.\n") + + m = MemSearch( + milvus_uri=str(tmp_path / "x.db"), + paths=[str(proj_dir)], + collection="ms_proj", + extra_scopes=[Scope(name="global", collection="ms_global", paths=[str(glob_dir)])], + ) + try: + await m.index() + proj_results = m._stores["project"].search([0.0] * m._embedder.dimension, top_k=10) + glob_results = m._stores["global"].search([0.0] * m._embedder.dimension, top_k=10) + proj_sources = {r["source"] for r in proj_results} + glob_sources = {r["source"] for r in glob_results} + assert any("p.md" in s for s in proj_sources) + assert not any("g.md" in s for s in proj_sources) + assert any("g.md" in s for s in glob_sources) + assert not any("p.md" in s for s in glob_sources) + finally: + m.close() + + +@pytest.mark.asyncio +async def test_index_skips_read_only_scope(tmp_path): + """A scope with empty paths must not be touched by index().""" + from memsearch.core import MemSearch, Scope + + proj_dir = tmp_path / "proj" + proj_dir.mkdir() + (proj_dir / "p.md").write_text("# P\n\nx.\n") + m = MemSearch( + milvus_uri=str(tmp_path / "x.db"), + paths=[str(proj_dir)], + collection="ms_proj", + extra_scopes=[Scope(name="readonly", collection="ms_team", paths=[])], + ) + try: + n = await m.index() + # Read-only scope's collection should be empty + ro_results = m._stores["readonly"].search([0.0] * m._embedder.dimension, top_k=10) + assert ro_results == [] + assert n > 0 + finally: + m.close() From c91abb14206c7f1b7dc0edd7edce40ac113c1f98 Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 17:55:50 +0800 Subject: [PATCH 12/16] feat(core): route watcher events to per-scope stores by path prefix Add _resolve_scope_for_path() (longest-prefix match across all scopes) and index_file_for_scope() (scope-aware single-file indexer); update watch() to build a unified path list and route _on_change to the correct store via the resolver instead of hardcoding the default scope. Co-Authored-By: Claude Sonnet 4.6 --- src/memsearch/core.py | 56 ++++++++++++++-- tests/test_watcher_multi_scope.py | 105 ++++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+), 5 deletions(-) create mode 100644 tests/test_watcher_multi_scope.py diff --git a/src/memsearch/core.py b/src/memsearch/core.py index 9b6774d7..9800a7dc 100644 --- a/src/memsearch/core.py +++ b/src/memsearch/core.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio +import contextlib import logging from collections.abc import Callable from dataclasses import dataclass, field @@ -246,6 +247,42 @@ async def index_file(self, path: str | Path) -> int: sf = ScannedFile(path=p, mtime=_st.st_mtime, size=_st.st_size) return await self._index_file(sf) + def _resolve_scope_for_path(self, file_path: Path | str) -> str | None: + """Return the scope name whose paths contain ``file_path`` (longest prefix wins). + + Returns None if the path is not under any configured scope. + """ + target = Path(file_path).expanduser().resolve() + + # Build (scope_name, resolved_path) entries for default scope and all extras + candidates: list[tuple[str, Path]] = [ + (self._default_scope_name, Path(p).expanduser().resolve()) for p in self._paths + ] + candidates.extend((sc.name, Path(p).expanduser().resolve()) for sc in self._extra_scopes for p in sc.paths) + + # Find all candidates that are an ancestor of (or equal to) target + matches: list[tuple[str, Path]] = [] + for name, root in candidates: + with contextlib.suppress(ValueError): + target.relative_to(root) + matches.append((name, root)) + + if not matches: + return None + # Longest path wins (most specific) + matches.sort(key=lambda x: len(x[1].parts), reverse=True) + return matches[0][0] + + async def index_file_for_scope(self, path: str | Path, scope_name: str) -> int: + """Index a single file into the named scope's store. + + Returns the number of chunks indexed. + """ + p = Path(path).expanduser().resolve() + st = p.stat() + sf = ScannedFile(path=p, mtime=st.st_mtime, size=st.st_size) + return await self._index_file(sf, scope_name=scope_name) + async def _index_file(self, f: ScannedFile, *, scope_name: str | None = None, force: bool = False) -> int: store = self._stores[scope_name] if scope_name else self._store source = str(f.path) @@ -540,12 +577,17 @@ def watch( def _on_change(event_type: str, file_path: Path) -> None: try: + scope_name = self._resolve_scope_for_path(file_path) + if scope_name is None: + logger.debug("Watcher event for %s ignored: not under any scope", file_path) + return + store = self._stores[scope_name] if event_type == "deleted": - self._store.delete_by_source(str(file_path)) - summary = f"Removed chunks for {file_path}" + store.delete_by_source(str(file_path)) + summary = f"[{scope_name}] removed chunks for {file_path}" else: - n = loop.run_until_complete(self.index_file(file_path)) - summary = f"Indexed {n} chunks from {file_path}" + n = loop.run_until_complete(self.index_file_for_scope(file_path, scope_name)) + summary = f"[{scope_name}] indexed {n} chunks from {file_path}" logger.info(summary) if on_event is not None: on_event(event_type, summary, file_path) @@ -558,7 +600,11 @@ def _on_change(event_type: str, file_path: Path) -> None: fw_kwargs: dict[str, Any] = {} if debounce_ms is not None: fw_kwargs["debounce_ms"] = debounce_ms - watcher = FileWatcher(self._paths, _on_change, **fw_kwargs) + # Watch all scopes' paths, not just the default scope + all_paths: list[str] = list(self._paths) + for sc in self._extra_scopes: + all_paths.extend(sc.paths) + watcher = FileWatcher(all_paths, _on_change, **fw_kwargs) watcher.start() return watcher diff --git a/tests/test_watcher_multi_scope.py b/tests/test_watcher_multi_scope.py new file mode 100644 index 00000000..28c08741 --- /dev/null +++ b/tests/test_watcher_multi_scope.py @@ -0,0 +1,105 @@ +"""Watcher routes file events to the correct scope's store via path-prefix match.""" + +from __future__ import annotations + +import os + +import pytest + +_needs_openai = pytest.mark.skipif( + not os.environ.get("OPENAI_API_KEY"), + reason="OPENAI_API_KEY not set", +) + + +@_needs_openai +def test_resolve_scope_for_path_matches_longest_prefix(tmp_path): + """The path-to-scope router picks the longest matching prefix.""" + from memsearch.core import MemSearch, Scope + + proj = tmp_path / "proj" + glob = tmp_path / "glob" + proj.mkdir() + glob.mkdir() + nested = proj / "nested" + nested.mkdir() + + m = MemSearch( + milvus_uri=str(tmp_path / "x.db"), + paths=[str(proj)], + collection="ms_proj", + extra_scopes=[Scope(name="global", collection="ms_global", paths=[str(glob)])], + ) + try: + # File under proj/ resolves to "project" + assert m._resolve_scope_for_path(proj / "p.md") == "project" + # File under proj/nested/ also resolves to "project" (prefix match still works) + assert m._resolve_scope_for_path(nested / "deep.md") == "project" + # File under glob/ resolves to "global" + assert m._resolve_scope_for_path(glob / "g.md") == "global" + # File outside any scope's paths → returns None + outside = tmp_path / "outside.md" + assert m._resolve_scope_for_path(outside) is None + finally: + m.close() + + +def test_resolve_scope_longest_prefix_wins(tmp_path): + """If two scopes' paths nest (e.g., one inside another), longest prefix wins.""" + # NOTE: validate_scope_paths normally rejects overlap; we test the resolver + # directly here, bypassing the validator, because nested paths CAN occur + # programmatically (e.g., a path passed to FileWatcher that happens to be + # under two registered roots). + from memsearch.core import MemSearch, Scope + + parent = tmp_path / "parent" + child = parent / "child" + child.mkdir(parents=True) + + m = MemSearch.__new__(MemSearch) # bypass __init__ to skip validate_scope_paths + m._default_scope_name = "outer" + m._paths = [str(parent)] + m._extra_scopes = [Scope(name="inner", collection="x", paths=[str(child)])] + + file_in_child = child / "deep.md" + # Longest-prefix match: file is under both parent and child; + # child is the longer prefix so "inner" wins. + assert m._resolve_scope_for_path(file_in_child) == "inner" + file_in_parent_only = parent / "shallow.md" + assert m._resolve_scope_for_path(file_in_parent_only) == "outer" + + +@_needs_openai +@pytest.mark.asyncio +async def test_watch_routes_modify_event_to_correct_scope(tmp_path): + """A modify event for a file under a scope's paths upserts into that scope's store.""" + from memsearch.core import MemSearch, Scope + + proj = tmp_path / "proj" + glob = tmp_path / "glob" + proj.mkdir() + glob.mkdir() + (proj / "p.md").write_text("# P\n\nProject content.\n") + (glob / "g.md").write_text("# G\n\nGlobal content.\n") + + m = MemSearch( + milvus_uri=str(tmp_path / "x.db"), + paths=[str(proj)], + collection="ms_proj", + extra_scopes=[Scope(name="global", collection="ms_global", paths=[str(glob)])], + ) + try: + # Simulate what _on_change does: route + index_file_for_scope + n_proj = await m.index_file_for_scope(proj / "p.md", scope_name="project") + n_glob = await m.index_file_for_scope(glob / "g.md", scope_name="global") + assert n_proj > 0 + assert n_glob > 0 + # Verify routing: project store has p.md, NOT g.md; global store has g.md, NOT p.md + proj_results = m._stores["project"].search([0.0] * m._embedder.dimension, top_k=10) + glob_results = m._stores["global"].search([0.0] * m._embedder.dimension, top_k=10) + assert any("p.md" in r["source"] for r in proj_results) + assert not any("g.md" in r["source"] for r in proj_results) + assert any("g.md" in r["source"] for r in glob_results) + assert not any("p.md" in r["source"] for r in glob_results) + finally: + m.close() From 7fcc9b1eafe4a2578bb91ac6de4bf7b19d5d94db Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 18:02:58 +0800 Subject: [PATCH 13/16] feat(cli): --extra-scope and --only-scope flags on search Add _parse_extra_scope helper, two new Click options on the search command, and wire extra_scopes/only_scope through to MemSearch.search(). Co-Authored-By: Claude Sonnet 4.6 --- src/memsearch/cli.py | 37 ++++++++++++++++++++++++++++++-- tests/test_cli_error_handling.py | 12 +++++++++++ tests/test_cli_help.py | 7 ++++++ 3 files changed, 54 insertions(+), 2 deletions(-) diff --git a/src/memsearch/cli.py b/src/memsearch/cli.py index 655e78a6..98ea7cc1 100644 --- a/src/memsearch/cli.py +++ b/src/memsearch/cli.py @@ -96,6 +96,23 @@ def _cfg_to_memsearch_kwargs(cfg: MemSearchConfig) -> dict: } +def _parse_extra_scope(value: str): + """Parse 'name:collection[:quota]' into a Scope.""" + from .core import Scope + + parts = value.split(":") + if len(parts) < 2 or len(parts) > 3: + raise click.BadParameter(f"Invalid --extra-scope format: {value!r}. Expected 'name:collection[:quota]'.") + name, collection = parts[0], parts[1] + quota: int | None = None + if len(parts) == 3: + try: + quota = int(parts[2]) + except ValueError: + raise click.BadParameter(f"Invalid quota in --extra-scope: {parts[2]!r}. Must be an integer.") from None + return Scope(name=name, collection=collection, quota=quota) + + def _normalize_compact_source(source: str | None) -> str | None: """Normalize compact --source paths to the absolute form used at index time. @@ -197,6 +214,18 @@ def index( @_common_options @click.option("--reranker-model", default=None, help="Cross-encoder model for reranking (empty string disables).") @click.option("--json-output", "-j", is_flag=True, help="Output as JSON.") +@click.option( + "--extra-scope", + "extra_scope", + multiple=True, + help="Add a search scope: name:collection[:quota]. Repeatable.", +) +@click.option( + "--only-scope", + "only_scope", + default=None, + help="Comma-separated scope names to restrict the search to.", +) def search( query: str, top_k: int | None, @@ -211,6 +240,8 @@ def search( milvus_token: str | None, reranker_model: str | None, json_output: bool, + extra_scope: tuple[str, ...], + only_scope: str | None, ) -> None: """Search indexed memory for QUERY.""" from .core import MemSearch @@ -228,10 +259,12 @@ def search( reranker_model=reranker_model, ) ) + extra_scopes = [_parse_extra_scope(v) for v in extra_scope] + only_scope_list = [s.strip() for s in only_scope.split(",") if s.strip()] if only_scope else None ms = None try: - ms = MemSearch(**_cfg_to_memsearch_kwargs(cfg)) - results = _run(ms.search(query, top_k=top_k or 5, source_prefix=source_prefix)) + ms = MemSearch(**_cfg_to_memsearch_kwargs(cfg), extra_scopes=extra_scopes) + results = _run(ms.search(query, top_k=top_k or 5, source_prefix=source_prefix, only_scope=only_scope_list)) if json_output: click.echo(json.dumps(results, indent=2, ensure_ascii=False)) else: diff --git a/tests/test_cli_error_handling.py b/tests/test_cli_error_handling.py index 273b8ce3..2a29c1a7 100644 --- a/tests/test_cli_error_handling.py +++ b/tests/test_cli_error_handling.py @@ -77,3 +77,15 @@ def fake_load(_path): assert result.exit_code == 1 assert "Configuration error:" in result.stderr assert "DEFINITELY_NOT_SET_MEMSEARCH_API_KEY" in result.stderr + + +def test_search_extra_scope_malformed_raises(): + # Single token (no colon) is invalid + result = CliRunner().invoke(cli, ["search", "foo", "--extra-scope", "badformat"]) + assert result.exit_code != 0 + assert "extra-scope" in result.output.lower() or "format" in result.output.lower() + + +def test_search_extra_scope_quota_not_int_raises(): + result = CliRunner().invoke(cli, ["search", "foo", "--extra-scope", "g:c:notanint"]) + assert result.exit_code != 0 diff --git a/tests/test_cli_help.py b/tests/test_cli_help.py index bf2e9783..4281b5b9 100644 --- a/tests/test_cli_help.py +++ b/tests/test_cli_help.py @@ -43,3 +43,10 @@ def test_chunk_size_flag_appears_in_help(args: list[str]) -> None: assert result.exit_code == 0 assert "--max-chunk-size" in result.output + + +def test_search_help_mentions_extra_scope(): + result = CliRunner().invoke(cli, ["search", "--help"]) + assert result.exit_code == 0 + assert "--extra-scope" in result.output + assert "--only-scope" in result.output From 451e6d4fd18be6de455a1ea0c6e3931302219519 Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 18:06:05 +0800 Subject: [PATCH 14/16] feat(cli): include scope in search text output when multi-scope --- src/memsearch/cli.py | 7 ++++++- tests/test_cli_help.py | 26 ++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/memsearch/cli.py b/src/memsearch/cli.py index 98ea7cc1..a79d29e2 100644 --- a/src/memsearch/cli.py +++ b/src/memsearch/cli.py @@ -276,7 +276,12 @@ def search( source = r.get("source", "?") heading = r.get("heading", "") content = r.get("content", "") - click.echo(f"\n--- Result {i} (score: {score:.4f}) ---") + scope = r.get("scope") + header = f"\n--- Result {i} (score: {score:.4f}" + if scope: + header += f", scope: {scope}" + header += ") ---" + click.echo(header) click.echo(f"Source: {source}") if heading: click.echo(f"Heading: {heading}") diff --git a/tests/test_cli_help.py b/tests/test_cli_help.py index 4281b5b9..6bb1f119 100644 --- a/tests/test_cli_help.py +++ b/tests/test_cli_help.py @@ -50,3 +50,29 @@ def test_search_help_mentions_extra_scope(): assert result.exit_code == 0 assert "--extra-scope" in result.output assert "--only-scope" in result.output + + +def test_search_text_output_includes_scope_when_present(monkeypatch): + """When results carry a 'scope' field, the text output shows it.""" + from click.testing import CliRunner + + from memsearch import cli as cli_mod + + fake_results = [ + {"chunk_hash": "h1", "score": 0.9, "source": "/x.md", "heading": "H", "content": "hi", "scope": "global"}, + ] + + class FakeMS: + def __init__(self, *a, **kw): + pass + + async def search(self, *a, **kw): + return fake_results + + def close(self): + pass + + monkeypatch.setattr("memsearch.core.MemSearch", FakeMS) + runner = CliRunner() + result = runner.invoke(cli_mod.cli, ["search", "foo"]) + assert "scope: global" in result.output From a7306e98c76b08a38b969097ea7272a016a2fb24 Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 18:09:59 +0800 Subject: [PATCH 15/16] feat(cli): pass config-loaded scopes to MemSearch; CLI flags append Co-Authored-By: Claude Sonnet 4.6 --- src/memsearch/cli.py | 22 ++++++++++++++++++++-- tests/test_cli_config_helpers.py | 20 ++++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/src/memsearch/cli.py b/src/memsearch/cli.py index a79d29e2..4517b127 100644 --- a/src/memsearch/cli.py +++ b/src/memsearch/cli.py @@ -81,6 +81,19 @@ def _build_cli_overrides(**kwargs) -> dict: def _cfg_to_memsearch_kwargs(cfg: MemSearchConfig) -> dict: """Extract MemSearch constructor kwargs from a resolved config.""" + from .core import Scope + + extra_scopes = [ + Scope( + name=sc.name, + collection=sc.collection, + paths=list(sc.paths), + quota=sc.quota, + uri=sc.uri or None, + token=sc.token or None, + ) + for sc in cfg.scopes + ] return { "embedding_provider": cfg.embedding.provider, "embedding_model": cfg.embedding.model or None, @@ -93,6 +106,9 @@ def _cfg_to_memsearch_kwargs(cfg: MemSearchConfig) -> dict: "max_chunk_size": cfg.chunking.max_chunk_size, "overlap_lines": cfg.chunking.overlap_lines, "reranker_model": cfg.reranker.model, + "default_scope_name": cfg.default_scope.name, + "default_scope_quota": cfg.default_scope.quota, + "extra_scopes": extra_scopes, } @@ -259,11 +275,13 @@ def search( reranker_model=reranker_model, ) ) - extra_scopes = [_parse_extra_scope(v) for v in extra_scope] only_scope_list = [s.strip() for s in only_scope.split(",") if s.strip()] if only_scope else None ms = None try: - ms = MemSearch(**_cfg_to_memsearch_kwargs(cfg), extra_scopes=extra_scopes) + base_kwargs = _cfg_to_memsearch_kwargs(cfg) + cli_extra = [_parse_extra_scope(v) for v in extra_scope] + base_kwargs["extra_scopes"] = base_kwargs.get("extra_scopes", []) + cli_extra + ms = MemSearch(**base_kwargs) results = _run(ms.search(query, top_k=top_k or 5, source_prefix=source_prefix, only_scope=only_scope_list)) if json_output: click.echo(json.dumps(results, indent=2, ensure_ascii=False)) diff --git a/tests/test_cli_config_helpers.py b/tests/test_cli_config_helpers.py index 73485918..3732b3aa 100644 --- a/tests/test_cli_config_helpers.py +++ b/tests/test_cli_config_helpers.py @@ -84,4 +84,24 @@ def test_cfg_to_memsearch_kwargs_translates_resolved_config() -> None: "max_chunk_size": 1800, "overlap_lines": 4, "reranker_model": "", + "default_scope_name": "project", + "default_scope_quota": None, + "extra_scopes": [], } + + +def test_cfg_to_memsearch_kwargs_includes_scopes(): + from memsearch.cli import _cfg_to_memsearch_kwargs + from memsearch.config import DefaultScopeConfig, MemSearchConfig, ScopeConfig + + cfg = MemSearchConfig( + default_scope=DefaultScopeConfig(name="myproj", quota=5), + scopes=[ScopeConfig(name="global", collection="ms_global", quota=3)], + ) + kwargs = _cfg_to_memsearch_kwargs(cfg) + assert kwargs["default_scope_name"] == "myproj" + assert kwargs["default_scope_quota"] == 5 + assert len(kwargs["extra_scopes"]) == 1 + assert kwargs["extra_scopes"][0].name == "global" + assert kwargs["extra_scopes"][0].collection == "ms_global" + assert kwargs["extra_scopes"][0].quota == 3 From a76fbe17fe3c63136b36b6a2f5649220f64b1e1e Mon Sep 17 00:00:00 2001 From: 1TommyCheung Date: Sun, 3 May 2026 18:43:18 +0800 Subject: [PATCH 16/16] test(scenarios): multi-scope E2E validation harness using ONNX embeddings Three scenario-driven workflows that exercise multi-scope routing end-to-end without requiring any API key (uses the ONNX local embedding provider): 1. Solo dev (closes #337): project + global personal scopes, blended retrieval with quota enforcement and only_scope restriction. 2. Chat agents shared memory: a "registrar" indexes shared canon once; multiple agents (Alice, Bob) attach to it as a read-only scope (empty paths) while each writes to their own private scope. Verifies cross-agent privacy. 3. Individual isolation: two independent MemSearch instances on separate Milvus DBs cannot cross-leak. Single-scope behavior unchanged. Run via: uv run python scripts/scenario_validation.py --- scripts/scenario_validation.py | 345 +++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 scripts/scenario_validation.py diff --git a/scripts/scenario_validation.py b/scripts/scenario_validation.py new file mode 100644 index 00000000..60ffa55d --- /dev/null +++ b/scripts/scenario_validation.py @@ -0,0 +1,345 @@ +"""Scenario-driven E2E validation of multi-scope memsearch. + +Runs three personas end-to-end with real ONNX embeddings (no API key). +Output is a transcript suitable for pasting into the PR thread as evidence. + +Personas: + 1. Solo dev (issue #337): project + global personal, blended retrieval + 2. Chat agents shared: agents share canon; each agent has private scope + 3. Individual: per-user private memory invisible to others +""" + +from __future__ import annotations + +import asyncio +import shutil +import tempfile +from pathlib import Path + +from memsearch.core import MemSearch, Scope + + +def _bar(label: str) -> None: + line = "=" * 78 + print(f"\n{line}\n {label}\n{line}") + + +def _section(label: str) -> None: + print(f"\n--- {label} ---") + + +def _show_results(label: str, results: list[dict]) -> None: + print(f"\n {label}: {len(results)} result(s)") + for i, r in enumerate(results, 1): + scope = r.get("scope", "—") + score = r.get("score", 0.0) + source = Path(r["source"]).name + snippet = r["content"][:80].replace("\n", " ") + print(f" {i}. [{scope:>10}] score={score:.3f} {source} «{snippet}»") + + +async def scenario_337_solo_dev(workdir: Path) -> None: + """#337: solo dev with project memory + global personal preferences. + + Setup: + - project/lazarus/ → ms_project_lazarus (deploy notes, fixes) + - personal/ → ms_personal (coding preferences) + + Verifies: + - Project queries surface project hits (highest priority) + - Cross-cutting queries surface BOTH project and personal hits with quota + - Querying from a different project still surfaces personal preferences + """ + _bar("SCENARIO 1: Solo dev (closes issue #337)") + + proj_dir = workdir / "project_lazarus" + pers_dir = workdir / "personal" + proj_dir.mkdir() + pers_dir.mkdir() + + (proj_dir / "deploy.md").write_text( + "# Lazarus Deployment\n\n" + "Deploy via scripts/deploy/bring_up_workspaces.sh. " + "The reproducibility-gate must use varied queries to bypass session cache.\n" + ) + (proj_dir / "bugfix.md").write_text( + "# Reproducibility Gate Bug\n\n" + "Fixed session cache by adding cache-busting query suffixes.\n" + ) + (pers_dir / "python_style.md").write_text( + "# My Python preferences\n\n" + "I prefer 4-space indentation. Always use type hints. Avoid implicit str→bytes coercion.\n" + ) + (pers_dir / "git_habits.md").write_text( + "# Git habits\n\nSquash-merge feature branches. Conventional commits. Sign commits with GPG.\n" + ) + + mem = MemSearch( + embedding_provider="onnx", + milvus_uri=str(workdir / "337.db"), + paths=[str(proj_dir)], + collection="ms_project_lazarus", + default_scope_quota=3, + extra_scopes=[ + Scope(name="personal", collection="ms_personal", paths=[str(pers_dir)], quota=2), + ], + ) + try: + _section("Indexing") + n = await mem.index() + print(f" Indexed {n} total chunks across scopes") + for sname, store in mem._stores.items(): + count = len(store.indexed_sources()) + print(f" {sname:>10}: {count} unique source(s)") + + _section("Query 1 — project-specific: 'how do I deploy lazarus'") + results = await mem.search("how do I deploy lazarus", top_k=4) + _show_results("Blended", results) + scopes_seen = {r["scope"] for r in results} + assert "project" in scopes_seen, "expected project scope in deploy query" + + _section("Query 2 — cross-cutting: 'python coding style'") + results = await mem.search("python coding style", top_k=4) + _show_results("Blended", results) + scopes_seen = {r["scope"] for r in results} + assert "personal" in scopes_seen, "expected personal scope to surface for style query" + + _section("Query 3 — restrict to personal only") + results = await mem.search("style", top_k=4, only_scope=["personal"]) + _show_results("only_scope=['personal']", results) + assert all(r["scope"] == "personal" for r in results) + + _section("Quota enforcement check") + results = await mem.search("deploy", top_k=10) + per_scope = {} + for r in results: + per_scope[r["scope"]] = per_scope.get(r["scope"], 0) + 1 + print(f" per-scope counts: {per_scope}") + print(f" configured quotas: project=3, personal=2") + assert per_scope.get("project", 0) <= 3 + assert per_scope.get("personal", 0) <= 2 + + print("\n ✓ Scenario 1 PASSED — solo dev with project + personal scopes works end-to-end") + finally: + mem.close() + + +async def scenario_chat_agents_shared(workdir: Path) -> None: + """Chat agents shared memory: agents read shared canon, write to private scopes. + + Setup: + - canon/ → ms_canon (read-only — populated once, shared by all) + - agent_alice_private/ → ms_alice_private (alice's private notes) + - agent_bob_private/ → ms_bob_private (bob's private notes) + + Verifies: + - Both agents see the same canon facts + - Each agent sees their OWN private notes but NOT the other's + - Read-only canon is searched but never indexed against (its files live in + a separate dir owned by a "registrar" process, not by the agents) + """ + _bar("SCENARIO 2: Chat agents — shared canon + per-agent private") + + canon_dir = workdir / "canon" + alice_dir = workdir / "agent_alice_private" + bob_dir = workdir / "agent_bob_private" + canon_dir.mkdir() + alice_dir.mkdir() + bob_dir.mkdir() + + # Canon facts (would be written by a "registrar" with access to canon_dir) + (canon_dir / "family_lore.md").write_text( + "# Family Lore\n\n" + "Cecil's name was changed from Clonk by Alice in 2024. " + "ZenCrabby is the canon owner. Tommy approves all canon changes.\n" + ) + (canon_dir / "world.md").write_text( + "# World\n\nThe Temple of Tobe is the family's main meeting place. Founded 2023.\n" + ) + + # Alice's private notes (only Alice can see these) + (alice_dir / "alice_notes.md").write_text( + "# Alice's private observations\n\n" + "Tommy seemed grumpy about gateway latency today. Bob asked about temple history again.\n" + ) + + # Bob's private notes (only Bob can see these) + (bob_dir / "bob_notes.md").write_text( + "# Bob's private observations\n\n" + "Cecil mentioned wanting to revisit the renaming. Alice was checking deployment timing.\n" + ) + + # ----- Step 1: index canon ONCE via a "registrar" instance ----- + registrar = MemSearch( + embedding_provider="onnx", + milvus_uri=str(workdir / "shared.db"), + paths=[str(canon_dir)], + collection="ms_canon", + ) + try: + n = await registrar.index() + print(f"\n Registrar indexed canon: {n} chunks") + finally: + registrar.close() + + # ----- Step 2: Alice's MemSearch — canon is READ-ONLY (no paths), private is writable ----- + alice = MemSearch( + embedding_provider="onnx", + milvus_uri=str(workdir / "shared.db"), # same Milvus, different collections + paths=[str(alice_dir)], + collection="ms_alice_private", + default_scope_name="alice_private", + default_scope_quota=2, + extra_scopes=[ + Scope(name="canon", collection="ms_canon", paths=[], quota=2), # read-only + ], + ) + bob = MemSearch( + embedding_provider="onnx", + milvus_uri=str(workdir / "shared.db"), + paths=[str(bob_dir)], + collection="ms_bob_private", + default_scope_name="bob_private", + default_scope_quota=2, + extra_scopes=[ + Scope(name="canon", collection="ms_canon", paths=[], quota=2), # read-only + ], + ) + + try: + _section("Indexing private scopes (canon NOT re-indexed by agents — read-only scope)") + await alice.index() + await bob.index() + + # Verify canon was NOT indexed by alice/bob (read-only scope = no paths) + # Their "canon" scope's collection sources came from registrar only + alice_canon_count = len(alice._stores["canon"].indexed_sources()) + bob_canon_count = len(bob._stores["canon"].indexed_sources()) + print(f" Alice's view of canon: {alice_canon_count} sources (registrar's work)") + print(f" Bob's view of canon: {bob_canon_count} sources (registrar's work)") + assert alice_canon_count == 2 and bob_canon_count == 2, "agents see canon via shared collection" + + _section("Query 'temple of tobe' via Alice — should surface canon") + results = await alice.search("temple of tobe", top_k=4) + _show_results("Alice's blended results", results) + scopes_seen = {r["scope"] for r in results} + assert "canon" in scopes_seen, "Alice should see canon facts" + # Alice should NOT see Bob's notes (different collection, not in her config) + assert not any("bob_notes" in r["source"] for r in results), "Alice must not see Bob's private notes" + + _section("Query 'gateway latency' via Alice — should surface Alice's private") + results = await alice.search("gateway latency observations", top_k=4) + _show_results("Alice's blended results", results) + # The hit should be Alice's private note + alice_private_hits = [r for r in results if r["scope"] == "alice_private"] + assert alice_private_hits, "Alice should see her own private observations" + + _section("Query 'gateway latency' via Bob — should NOT surface Alice's private") + results = await bob.search("gateway latency observations", top_k=4) + _show_results("Bob's blended results", results) + # Bob's results MUST NOT contain anything from alice_dir + assert not any("alice_notes" in r["source"] for r in results), \ + "PRIVACY VIOLATION: Bob saw Alice's private notes!" + print(" ✓ Privacy preserved: Bob cannot see Alice's private notes") + + _section("Query 'cecil' via Bob — should surface canon + Bob's private") + results = await bob.search("cecil renaming", top_k=4) + _show_results("Bob's blended results", results) + scopes_seen = {r["scope"] for r in results} + # Should have BOTH canon and bob_private + print(f" Scopes returned: {scopes_seen}") + + print("\n ✓ Scenario 2 PASSED — shared canon read by both agents; private scopes are isolated") + finally: + alice.close() + bob.close() + + +async def scenario_individual_isolation(workdir: Path) -> None: + """Individual: per-user memory pools that must not leak across users. + + Setup: + - user_alice/ → ms_user_alice (Alice's project work) + - user_bob/ → ms_user_bob (Bob's project work) + + Each user runs their own MemSearch with their own collection. Verifies + that one user's queries cannot reach another user's collection unless + explicitly configured. + """ + _bar("SCENARIO 3: Individual user isolation") + + alice_dir = workdir / "user_alice" + bob_dir = workdir / "user_bob" + alice_dir.mkdir() + bob_dir.mkdir() + + (alice_dir / "alice_secret.md").write_text( + "# Alice's secret project\n\nAPI key rotation schedule: every 90 days. Notify ops.\n" + ) + (bob_dir / "bob_secret.md").write_text( + "# Bob's secret project\n\nDatabase migration plan: dry-run on staging first.\n" + ) + + alice = MemSearch( + embedding_provider="onnx", + milvus_uri=str(workdir / "indiv_alice.db"), + paths=[str(alice_dir)], + collection="ms_user_alice", + ) + bob = MemSearch( + embedding_provider="onnx", + milvus_uri=str(workdir / "indiv_bob.db"), + paths=[str(bob_dir)], + collection="ms_user_bob", + ) + + try: + _section("Indexing per-user") + await alice.index() + await bob.index() + + _section("Alice queries her own data") + results = await alice.search("API key rotation", top_k=3) + _show_results("Alice's results (single-scope, no scope tag)", results) + # Single-scope mode: no 'scope' field on results + assert results + assert "scope" not in results[0], "single-scope must not add scope tag" + assert any("alice_secret" in r["source"] for r in results) + + _section("Alice queries Bob's data — should return nothing") + results = await alice.search("database migration plan", top_k=3) + _show_results("Alice's results", results) + # Alice's query against her own collection should NOT find bob's content + assert not any("bob_secret" in r["source"] for r in results), \ + "PRIVACY VIOLATION: Alice's query reached Bob's collection!" + print(" ✓ Isolation preserved: separate Milvus DBs and collections cannot cross-leak") + + _section("Bob queries his own data") + results = await bob.search("database migration plan", top_k=3) + _show_results("Bob's results", results) + assert any("bob_secret" in r["source"] for r in results) + + print("\n ✓ Scenario 3 PASSED — per-user isolation works (single-scope mode unchanged)") + finally: + alice.close() + bob.close() + + +async def main() -> None: + workdir = Path(tempfile.mkdtemp(prefix="memsearch_scenario_")) + print(f"Workdir: {workdir}") + try: + for sub in ("scenario1", "scenario2", "scenario3"): + (workdir / sub).mkdir() + await scenario_337_solo_dev(workdir / "scenario1") + await scenario_chat_agents_shared(workdir / "scenario2") + await scenario_individual_isolation(workdir / "scenario3") + _bar("ALL SCENARIOS PASSED ✓") + finally: + shutil.rmtree(workdir, ignore_errors=True) + + +if __name__ == "__main__": + # Each scenario uses its own subdir so workdirs don't collide + Path(tempfile.gettempdir()).mkdir(exist_ok=True) + asyncio.run(main())