diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml new file mode 100644 index 0000000..dea5fe5 --- /dev/null +++ b/.github/workflows/python-ci.yml @@ -0,0 +1,29 @@ +name: python-ci + +on: + push: + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13"] + defaults: + run: + working-directory: packages/core-python + permissions: + contents: read + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + cache-dependency-path: packages/core-python/pyproject.toml + - run: python -m pip install -e ".[dev]" + - run: python -m pytest + - run: python -m build + - run: python -m twine check dist/* diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 0000000..ebde3fc --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,46 @@ +name: python-publish + +on: + release: + types: [published] + +jobs: + build: + runs-on: ubuntu-latest + defaults: + run: + working-directory: packages/core-python + permissions: + contents: read + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + cache: pip + cache-dependency-path: packages/core-python/pyproject.toml + - run: python -m pip install -e ".[dev]" + - run: python -m build + - run: python -m twine check dist/* + - uses: actions/upload-artifact@v4 + with: + name: python-distributions + path: packages/core-python/dist/* + if-no-files-found: error + + publish: + runs-on: ubuntu-latest + needs: build + permissions: + contents: read + id-token: write + environment: + name: pypi + steps: + - uses: actions/download-artifact@v4 + with: + name: python-distributions + path: dist + - uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist diff --git a/.gitignore b/.gitignore index d50649a..831edeb 
100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ scripts/* !scripts/test.mjs !scripts/smoke.mjs !scripts/e2e-icp-local.sh +!scripts/regenerate-python-fixture.mjs npm-debug.log* yarn-debug.log* yarn-error.log* @@ -14,14 +15,37 @@ pnpm-lock.yaml # Build outputs dist/ tmpdist/ +build/ coverage/ .nyc_output/ # TypeScript *.tsbuildinfo +# Python +__pycache__/ +*.py[cod] +.pytest_cache/ +.python-user-base/ +.mypy_cache/ +.ruff_cache/ +.tox/ +.nox/ +.eggs/ +*.egg-info/ +pip-wheel-metadata/ +htmlcov/ +.coverage +.coverage.* +.venv/ +venv/ +env/ + # Env / system files .env +.env.local +.env.*.local +.agents/ .DS_Store Thumbs.db diff --git a/README.md b/README.md index a67be10..9746821 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,32 @@ cargo test --- +# 🐍 Python Runtime Support (Phase 2) + +Knolo also ships a pure-Python runtime in `packages/core-python` for mounting existing `.knolo` packs and running deterministic lexical queries locally. + +It stays local-first, requires no vector database, and does not use embeddings on the default query path. + +Install locally: + +```bash +cd packages/core-python +python -m pip install -e ".[dev]" +``` + +Use it from Python: + +```python +from knolo import mount_pack, query + +pack = mount_pack("tests/fixtures/simple.knolo") +hits = query(pack, "alpha beta", top_k=5) +``` + +For the release checklist and publishing notes, see [`packages/core-python/README.md`](packages/core-python/README.md) and [`packages/core-python/RELEASE.md`](packages/core-python/RELEASE.md). + +--- + # 🌐 ICP Canister Adapter (New) Knolo now ships a local-first ICP path that keeps retrieval lexical-first and talks to the canister directly, with no middleware and no vector database. 
diff --git a/package-lock.json b/package-lock.json index d3130f4..e45db6c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -76,7 +76,7 @@ }, "packages/core": { "name": "@knolo/core", - "version": "3.4.0", + "version": "3.4.1", "license": "Apache-2.0", "devDependencies": { "@types/node": "^20.11.0", diff --git a/packages/core-python/MANIFEST.in b/packages/core-python/MANIFEST.in new file mode 100644 index 0000000..b4b7bbd --- /dev/null +++ b/packages/core-python/MANIFEST.in @@ -0,0 +1,3 @@ +include README.md +include src/knolo/py.typed + diff --git a/packages/core-python/README.md b/packages/core-python/README.md new file mode 100644 index 0000000..62814a5 --- /dev/null +++ b/packages/core-python/README.md @@ -0,0 +1,108 @@ +# `knolo` + +`knolo` is the pure-Python runtime for mounting existing `.knolo` packs and running deterministic lexical queries locally. + +It is intentionally release-scoped for Phase 2: + +- local-first retrieval +- deterministic lexical retrieval +- no vector database +- no embeddings on the default query path +- no Python pack builder +- no LangChain or LlamaIndex integration +- no Node.js runtime dependency for mount/query + +Packs are still built with `@knolo/core` in TypeScript, then mounted and queried from Python. + +## Install + +From this package directory: + +```bash +python -m pip install -e ".[dev]" +``` + +For a normal install, omit the extra: + +```bash +python -m pip install . +``` + +## Query + +```python +from knolo import mount_pack, query + +pack = mount_pack("tests/fixtures/simple.knolo") +hits = query(pack, "alpha beta", top_k=5) + +for hit in hits: + print(hit.block_id, hit.score, hit.text) +``` + +You can also mount bytes directly: + +```python +from pathlib import Path +from knolo import mount_pack_from_bytes + +pack = mount_pack_from_bytes(Path("tests/fixtures/simple.knolo").read_bytes()) +``` + +## Release Readiness + +The package publishes from GitHub release events via Trusted Publishing. 
No secret-based PyPI credentials are required in CI. + +Before a release, run: + +```bash +python -m pytest +python -m build +python -m twine check dist/* +``` + +A manual upload fallback is still available when needed: + +```bash +python -m twine upload dist/* +``` + +See [`RELEASE.md`](./RELEASE.md) for the release checklist. + +## Fixture Regeneration + +The committed fixture at `tests/fixtures/simple.knolo` is what tests use, so the test suite does not need Node.js at runtime. + +To regenerate the fixture from the checked-in corpus, run the root helper script from the repo root: + +```bash +node scripts/regenerate-python-fixture.mjs +``` + +The script reads `tests/fixtures/corpus/intro.md`, `runtime.md`, and `other.md`, then rewrites the committed binary fixture. Pass `--check` to verify that the committed bytes match the corpus without rewriting. + +## API + +The public package exports: + +- `mount_pack(source)` +- `mount_pack_from_bytes(data)` +- `query(pack, q, ...)` +- `KnoloError` +- `InvalidPackError` +- `PackStats` +- `PackMeta` +- `Pack` +- `QueryOptions` +- `Hit` +- `tokenize()` +- `normalize()` +- `__version__` + +## Current Scope + +- No Python pack builder yet +- No semantic reranking +- No embeddings or vector database integration on the default path +- No Node.js runtime dependency at query time +- No LangChain or LlamaIndex adapters in this package diff --git a/packages/core-python/RELEASE.md b/packages/core-python/RELEASE.md new file mode 100644 index 0000000..6faaf9b --- /dev/null +++ b/packages/core-python/RELEASE.md @@ -0,0 +1,13 @@ +# Release Checklist + +- [ ] Confirm the `knolo` distribution name is still available on PyPI, or choose a fallback package name before release if it is not. 
+- [ ] `cd packages/core-python && python -m pip install -e ".[dev]"` +- [ ] `cd packages/core-python && python -m pytest` +- [ ] `cd packages/core-python && python -m build` +- [ ] `cd packages/core-python && python -m twine check dist/*` +- [ ] Verify the wheel contents with `python -m zipfile -l dist/knolo-*.whl`. +- [ ] Verify the sdist contents with `tar -tzf dist/knolo-*.tar.gz`. +- [ ] Confirm the Python CI workflow passes on Python 3.10, 3.11, 3.12, and 3.13. +- [ ] Confirm the publish workflow only runs on GitHub release publication and uses Trusted Publishing with no hardcoded secrets. +- [ ] Smoke install the built wheel in a clean environment and run a basic `mount_pack` / `query` check. +- [ ] Yank a bad PyPI release instead of republishing the same tag if a release needs to be rolled back. diff --git a/packages/core-python/pyproject.toml b/packages/core-python/pyproject.toml new file mode 100644 index 0000000..204bb9c --- /dev/null +++ b/packages/core-python/pyproject.toml @@ -0,0 +1,35 @@ +[build-system] +requires = [] +build-backend = "setuptools.build_meta" +backend-path = ["."] + +[project] +name = "knolo" +version = "0.1.0" +description = "Pure-Python runtime for mounting and querying .knolo packs." 
+readme = { file = "README.md", content-type = "text/markdown" } +requires-python = ">=3.10" +license = { text = "Apache-2.0" } +authors = [{ name = "Knolo" }] +dependencies = [] + +[project.optional-dependencies] +dev = [ + "build>=1.2", + "pytest>=8", + "twine>=5", +] + +[tool.setuptools] +package-dir = { "" = "src" } +include-package-data = true + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +knolo = ["py.typed"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +pythonpath = ["src"] diff --git a/packages/core-python/setuptools/__init__.py b/packages/core-python/setuptools/__init__.py new file mode 100644 index 0000000..ab56c09 --- /dev/null +++ b/packages/core-python/setuptools/__init__.py @@ -0,0 +1,2 @@ +"""Local build backend shim for the knolo Python package.""" + diff --git a/packages/core-python/setuptools/build_meta.py b/packages/core-python/setuptools/build_meta.py new file mode 100644 index 0000000..7517192 --- /dev/null +++ b/packages/core-python/setuptools/build_meta.py @@ -0,0 +1,188 @@ +from __future__ import annotations + +import base64 +import csv +import hashlib +import io +import tarfile +import textwrap +import time +import zipfile +from pathlib import Path +from typing import Iterable + +ROOT = Path(__file__).resolve().parents[1] +PACKAGE_NAME = "knolo" +VERSION = "0.1.0" +DIST_INFO = f"{PACKAGE_NAME}-{VERSION}.dist-info" +WHEEL_NAME = f"{PACKAGE_NAME}-{VERSION}-py3-none-any.whl" +SDIST_NAME = f"{PACKAGE_NAME}-{VERSION}.tar.gz" + + +def get_requires_for_build_wheel(config_settings=None): + return [] + + +def get_requires_for_build_editable(config_settings=None): + return [] + + +def get_requires_for_build_sdist(config_settings=None): + return [] + + +def prepare_metadata_for_build_wheel(metadata_directory, config_settings=None): + return _write_metadata_dir(Path(metadata_directory)) + + +def prepare_metadata_for_build_editable(metadata_directory, config_settings=None): + return 
def build_sdist(sdist_directory, config_settings=None):
    """Build the source distribution archive (PEP 517 ``build_sdist`` hook).

    Every path yielded by ``_iter_sdist_paths()`` is stored beneath a
    ``<name>-<version>/`` root directory, followed by a generated
    ``PKG-INFO`` rendered from ``_metadata_text()``.

    Args:
        sdist_directory: Directory that receives the ``.tar.gz``; created
            (including parents) when it does not exist yet.
        config_settings: Accepted for hook compatibility; not used.

    Returns:
        The basename of the archive written into ``sdist_directory``.
    """
    out_dir = Path(sdist_directory)
    out_dir.mkdir(parents=True, exist_ok=True)
    target = out_dir / SDIST_NAME
    root_name = f"{PACKAGE_NAME}-{VERSION}"

    # Snapshot the file list *before* the archive is created and drop the
    # archive itself. When the output directory lives inside the project
    # (and is not one of the skipped directory names) a lazy walk would
    # otherwise pick up the half-written tarball, or a stale one from a
    # previous build, and pack it into itself.
    target_resolved = target.resolve()
    sources = [
        path
        for path in _iter_sdist_paths()
        if path.resolve() != target_resolved
    ]

    with tarfile.open(target, "w:gz") as tar:
        for path in sources:
            arcname = Path(root_name) / path.relative_to(ROOT)
            info = tar.gettarinfo(str(path), arcname=str(arcname))
            if path.is_file():
                with path.open("rb") as fh:
                    tar.addfile(info, fh)
            else:
                # Non-regular entries carry no payload.
                tar.addfile(info)

        # PKG-INFO is synthesised in memory rather than read from disk.
        pkg_info = _metadata_text().encode("utf-8")
        info = tarfile.TarInfo(name=f"{root_name}/PKG-INFO")
        info.size = len(pkg_info)
        info.mtime = int(time.time())
        info.mode = 0o644
        tar.addfile(info, io.BytesIO(pkg_info))

    return SDIST_NAME
def _iter_sdist_paths(root: Path | None = None) -> Iterable[Path]:
    """Yield the files under the project root that belong in the sdist.

    Args:
        root: Directory to walk. Defaults to the module-level ``ROOT``.

    Yields:
        Paths of non-directory entries below ``root``, in sorted order so
        the archive layout does not depend on filesystem walk order.

    Entries are skipped when any path component *below* ``root`` is a
    cache, virtualenv or build-output directory, ends in ``.egg-info``, or
    when the file is compiled bytecode. Only components relative to
    ``root`` are inspected: a checkout that merely lives underneath a
    directory called e.g. ``dist`` or ``.tox`` must still produce a
    complete archive.
    """
    base = ROOT if root is None else root
    skip_dirs = {
        ".git",
        "__pycache__",
        ".pytest_cache",
        ".mypy_cache",
        ".python-user-base",
        ".ruff_cache",
        ".tox",
        ".nox",
        ".eggs",
        ".venv",
        "venv",
        "env",
        "build",
        "dist",
        "htmlcov",
        "pip-wheel-metadata",
    }
    bytecode_suffixes = (".pyc", ".pyo", ".pyd")

    for path in sorted(base.rglob("*")):
        parts = path.relative_to(base).parts
        if any(part in skip_dirs or part.endswith(".egg-info") for part in parts):
            continue
        if path.is_dir():
            continue
        if path.name.endswith(bytecode_suffixes):
            continue
        yield path
@dataclass(slots=True)
class Pack:
    """In-memory form of a mounted ``.knolo`` pack.

    Instances are produced by ``mount_pack_from_bytes``. The per-block
    tuples (``blocks``, ``headings``, ``doc_ids``, ``namespaces``,
    ``block_token_lens``) are built in lockstep, one entry per block, and
    are indexed by block id.
    """

    # Parsed "meta" section: format version plus corpus statistics.
    meta: PackMeta
    # Term string -> term id, as read from the "lexicon" section.
    lexicon: dict[str, int]
    # Flat stream of unsigned 32-bit values. For each term: the term id,
    # then for every block an encoded block id followed by that term's
    # positions and a 0 terminator; a 0 in the block-id slot ends the
    # term's list. Block ids are stored offset by one when
    # meta.version >= 3.
    postings: tuple[int, ...]
    # Block text, one entry per block.
    blocks: tuple[str, ...]
    # Optional heading of each block (None when absent or not a string).
    headings: tuple[str | None, ...]
    # Optional "docId" of each block; surfaced as Hit.source by query().
    doc_ids: tuple[str | None, ...]
    # Optional namespace of each block; used by the namespace filter.
    namespaces: tuple[str | None, ...]
    # Token count of each block: the stored "len" value when it is a
    # non-negative integer, otherwise the number of tokens in the text.
    block_token_lens: tuple[int, ...]
def query(
    pack: Pack,
    q: str,
    options: QueryOptions | None = None,
    *,
    top_k: int | object = _MISSING,
    min_score: float | object = _MISSING,
    namespace: FilterInput | object = _MISSING,
    source: FilterInput | object = _MISSING,
) -> list[Hit]:
    """Rank the blocks of ``pack`` against ``q`` with a BM25-style score.

    Args:
        pack: A mounted pack.
        q: Free-text query; it is tokenized and matched against the lexicon.
        options: Optional base options. Any keyword argument that is
            passed explicitly overrides the matching field.
        top_k: Maximum number of hits to return (positive integer).
        min_score: Hits scoring below this value are dropped.
        namespace: String or iterable of strings; only blocks whose
            namespace matches one of them are kept.
        source: String or iterable of strings; only blocks whose doc id
            matches one of them are kept.

    Returns:
        Hits ordered by descending score, ties broken by ascending block
        id, truncated to ``top_k``. Empty when the query has no tokens,
        no token is in the lexicon, or nothing survives the filters.

    Raises:
        ValueError: If ``top_k``/``min_score`` are invalid or a filter is
            not a string / iterable of strings.
        InvalidPackError: If the postings stream ends unexpectedly.
    """
    opts = _merge_query_options(
        options,
        top_k=top_k,
        min_score=min_score,
        namespace=namespace,
        source=source,
    )
    _validate_query_options(opts)

    if not q.strip():
        return []

    terms = tokenize(q)
    if not terms:
        return []

    wanted_ids = {pack.lexicon[term] for term in terms if term in pack.lexicon}
    if not wanted_ids:
        return []

    candidates, dfs = _scan_postings(pack, wanted_ids)
    if not candidates:
        return []

    # Both filters are normalized up front so an invalid one is reported
    # even when the other already eliminates every candidate.
    namespace_filters = _normalize_filter_values(opts.namespace)
    source_filters = _normalize_filter_values(opts.source)
    for values, wanted in (
        (pack.namespaces, namespace_filters),
        (pack.doc_ids, source_filters),
    ):
        if not wanted:
            continue
        candidates = {
            block_id: tf_map
            for block_id, tf_map in candidates.items()
            if _matches_filter(values, block_id, wanted)
        }
        if not candidates:
            return []

    doc_count = max(pack.meta.stats.blocks, len(pack.blocks), 1)
    avg_len = _resolve_avg_block_len(pack)
    k1 = 1.5
    b = 0.75

    hits: list[Hit] = []
    for block_id, tf_map in candidates.items():
        block_len = _resolve_block_len(pack, block_id)
        # The length normalisation only depends on the block, not the term.
        length_norm = k1 * (1.0 - b + b * (block_len / avg_len))

        score = 0.0
        for term_id, tf in tf_map.items():
            df = dfs.get(term_id, 0)
            idf = math.log(1.0 + (doc_count - df + 0.5) / (df + 0.5))
            score += idf * ((tf * (k1 + 1.0)) / (tf + length_norm))

        if score < opts.min_score:
            continue

        text = pack.blocks[block_id] if block_id < len(pack.blocks) else ""
        doc_id = pack.doc_ids[block_id] if block_id < len(pack.doc_ids) else None
        block_ns = pack.namespaces[block_id] if block_id < len(pack.namespaces) else None
        hits.append(
            Hit(
                block_id=block_id,
                score=score,
                text=text,
                source=doc_id,
                namespace=block_ns,
            )
        )

    ranked = sorted(hits, key=lambda hit: (-hit.score, hit.block_id))
    return ranked[: opts.top_k]
def _resolve_avg_block_len(pack: Pack) -> float:
    """Return the average block length used for score normalisation.

    The value recorded in the pack metadata wins when it is a finite
    number greater than zero. Otherwise the mean of the per-block lengths
    is computed, never going below ``1.0``; a pack without blocks yields
    ``1.0``.
    """
    declared = pack.meta.stats.avg_block_len
    declared_is_usable = (
        isinstance(declared, (int, float))
        and math.isfinite(declared)
        and declared > 0
    )
    if declared_is_usable:
        return float(declared)

    block_count = len(pack.blocks)
    if block_count == 0:
        return 1.0

    total = sum(_resolve_block_len(pack, index) for index in range(block_count))
    return max(total / block_count, 1.0)
def _matches_filter(values: tuple[str | None, ...], block_id: int, filter_values: set[str]) -> bool:
    """Tell whether the block at ``block_id`` passes a metadata filter.

    An empty ``filter_values`` accepts every block. Otherwise the block
    passes only when ``values`` has a string entry for it whose normalized
    form is in ``filter_values``; a missing or non-string entry fails.
    """
    if not filter_values:
        return True

    candidate = values[block_id] if block_id < len(values) else None
    if not isinstance(candidate, str):
        return False
    return normalize(candidate) in filter_values
def _read_json_section(view: memoryview, offset: int, name: str) -> tuple[Any, int]:
    """Read one length-prefixed JSON section from the pack buffer.

    The section is a 32-bit length followed by that many bytes of UTF-8
    encoded JSON.

    Args:
        view: Byte view over the whole pack.
        offset: Position of the section's length prefix.
        name: Section name, used only in error messages.

    Returns:
        ``(payload, next_offset)`` where ``next_offset`` points just past
        the section body.

    Raises:
        InvalidPackError: If the section runs past the end of the buffer,
            is not valid UTF-8, is not valid JSON, or is nested so deeply
            that the JSON decoder gives up.
    """
    length, offset = _read_u32(view, offset)
    end = offset + length
    if end > len(view):
        raise InvalidPackError(f"{name} section is truncated")

    raw = bytes(view[offset:end])

    try:
        text = raw.decode("utf-8")
    except UnicodeDecodeError as exc:
        raise InvalidPackError(f"{name} section is not valid UTF-8") from exc

    try:
        payload = json.loads(text)
    except json.JSONDecodeError as exc:
        raise InvalidPackError(f"{name} section is not valid JSON") from exc
    except RecursionError as exc:
        # Pack bytes are untrusted input: pathological nesting must be
        # reported as a bad pack instead of leaking a RecursionError.
        raise InvalidPackError(f"{name} section is nested too deeply") from exc

    return payload, end
def _read_u32_array(view: memoryview, offset: int, length: int) -> list[int]:
    """Read ``length`` consecutive 32-bit values starting at ``offset``.

    Args:
        view: Byte view over the whole pack.
        offset: Position of the first value.
        length: Number of values to read.

    Returns:
        The decoded values, in buffer order.

    Raises:
        InvalidPackError: If fewer than ``length`` values remain in the
            buffer.
    """
    if length > (len(view) - offset) // 4:
        raise InvalidPackError("unexpected end of buffer")

    # The range is validated above, so the whole run is decoded in one pass
    # over a zero-copy slice, using the same struct layout as _read_u32.
    chunk = view[offset : offset + length * 4]
    return [value for (value,) in _UINT32.iter_unpack(chunk)]
def tokenize(text: str) -> list[str]:
    """Return the lowercase alphanumeric runs of ``text``, in order.

    Every character for which ``str.isalnum()`` is false acts as a
    separator. Characters are lowercased one at a time, not as a whole
    string, so casing rules that depend on neighbouring characters never
    apply.
    """
    # Alphanumeric characters never lowercase to whitespace, so replacing
    # each separator with a space and splitting recovers exactly the runs.
    spaced = "".join(ch.lower() if ch.isalnum() else " " for ch in text)
    return spaced.split()
diff --git a/packages/core-python/tests/fixtures/corpus/intro.md b/packages/core-python/tests/fixtures/corpus/intro.md
new file mode 100644
index 0000000..76bfa99
--- /dev/null
+++ b/packages/core-python/tests/fixtures/corpus/intro.md
@@ -0,0 +1,3 @@
+# Alpha Intro
+
+alpha beta
diff --git a/packages/core-python/tests/fixtures/corpus/other.md b/packages/core-python/tests/fixtures/corpus/other.md
new file mode 100644
index 0000000..9a40f07
--- /dev/null
+++ b/packages/core-python/tests/fixtures/corpus/other.md
@@ -0,0 +1,3 @@
+# Alpha Reference
+
+alpha beta
diff --git a/packages/core-python/tests/fixtures/corpus/runtime.md b/packages/core-python/tests/fixtures/corpus/runtime.md
new file mode 100644
index 0000000..a5f4e83
--- /dev/null
+++ b/packages/core-python/tests/fixtures/corpus/runtime.md
@@ -0,0 +1,3 @@
+# Beta Guide
+
+beta gamma delta
diff --git a/packages/core-python/tests/fixtures/simple.knolo b/packages/core-python/tests/fixtures/simple.knolo
new file mode 100644
index 0000000..5546e31
Binary files /dev/null and b/packages/core-python/tests/fixtures/simple.knolo differ
diff --git a/packages/core-python/tests/test_runtime.py b/packages/core-python/tests/test_runtime.py
new file mode 100644
index 0000000..30af294
--- /dev/null
+++ b/packages/core-python/tests/test_runtime.py
@@ -0,0 +1,147 @@
+from __future__ import annotations
+
+import importlib.util
+import struct
+import zipfile
+from pathlib import Path
+
+import pytest
+
+from knolo import (
+    InvalidPackError,
+    QueryOptions,
+    mount_pack,
+    mount_pack_from_bytes,
+    query,
+)
+
+
+# Binary fixture pack stored in the "fixtures" directory next to this file.
+FIXTURE_PATH = Path(__file__).resolve().parent / "fixtures" / "simple.knolo"
+# Build backend module located in "setuptools/" one directory above this
+# file's directory; it is loaded by file path in _load_build_backend().
+BUILD_BACKEND_PATH = Path(__file__).resolve().parents[1] / "setuptools" / "build_meta.py"
+
+
+@pytest.fixture(scope="module")
+def fixture_bytes() -> bytes:
+    """Return the raw bytes of the fixture pack (read once per test module)."""
+    return FIXTURE_PATH.read_bytes()
+
+
+@pytest.fixture(scope="module")
+def fixture_pack(fixture_bytes: bytes):
+    """Return the pack mounted from the fixture bytes (shared per test module)."""
+    return mount_pack_from_bytes(fixture_bytes)
+
+
+def _load_build_backend():
+    """Import the build backend from BUILD_BACKEND_PATH and return the module.
+
+    The module is loaded by file path under the name
+    "knolo_local_build_meta" rather than through the regular import system.
+    """
+    spec = importlib.util.spec_from_file_location("knolo_local_build_meta", BUILD_BACKEND_PATH)
+    assert spec is not None
+    assert spec.loader is not None
+
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def test_mounts_from_path_and_bytes(fixture_bytes: bytes):
+    """Mounting by path and mounting from bytes must yield equal packs."""
+    pack_from_path = mount_pack(FIXTURE_PATH)
+    pack_from_bytes = mount_pack_from_bytes(fixture_bytes)
+
+    assert pack_from_path == pack_from_bytes
+
+
+def test_preserves_metadata_and_block_fields(fixture_pack):
+    """The mounted pack exposes the fixture's meta stats and per-block fields."""
+    assert fixture_pack.meta.version == 3
+    assert fixture_pack.meta.stats.docs == 3
+    assert fixture_pack.meta.stats.blocks == 3
+    assert fixture_pack.meta.stats.terms == 4
+    assert fixture_pack.blocks == ("alpha beta", "beta gamma delta", "alpha beta")
+    assert fixture_pack.headings == (
+        "Alpha Intro",
+        "Beta Guide",
+        "Alpha Reference",
+    )
+    assert fixture_pack.doc_ids == ("intro.md", "runtime.md", "other.md")
+    assert fixture_pack.namespaces == ("docs.alpha", "docs.beta", "docs.alpha")
+    assert fixture_pack.block_token_lens == (2, 3, 2)
+
+
+def test_query_is_deterministic_and_ranks_by_block_id_tie_breaker(fixture_pack):
+    """Two hits with (approximately) equal scores come back in block-id order."""
+    hits = query(fixture_pack, "alpha beta", top_k=5)
+
+    assert [hit.source for hit in hits[:2]] == ["intro.md", "other.md"]
+    assert hits[0].score == pytest.approx(hits[1].score)
+    assert hits[0].block_id < hits[1].block_id
+
+
+def test_query_supports_namespace_and_source_filters(fixture_pack):
+    """The namespace and source keyword arguments each restrict the hits."""
+    namespace_hits = query(fixture_pack, "alpha", namespace="docs.alpha", top_k=5)
+    assert [hit.source for hit in namespace_hits] == ["intro.md", "other.md"]
+
+    source_hits = query(fixture_pack, "alpha", source="other.md", top_k=5)
+    assert [hit.source for hit in source_hits] == ["other.md"]
+
+
+def test_blank_query_returns_empty_list(fixture_pack):
+    """Empty and whitespace-only queries return an empty list."""
+    assert query(fixture_pack, "") == []
+    assert query(fixture_pack, " ") == []
+
+
+def test_top_k_limits_results(fixture_pack):
+    """top_k=1 returns a single hit, expected to come from intro.md."""
+    hits = query(fixture_pack, "beta", top_k=1)
+    assert len(hits) == 1
+    assert hits[0].source == "intro.md"
+
+
+def test_min_score_filters_results(fixture_pack):
+    """A min_score of 10.0 leaves no hits for the query "alpha"."""
+    assert query(fixture_pack, "alpha", min_score=10.0) == []
+
+
+def test_query_options_are_merged_with_explicit_kwargs(fixture_pack):
+    """Explicit kwargs take precedence over QueryOptions; other options still apply."""
+    options = QueryOptions(top_k=1, namespace="docs.alpha")
+    # top_k=2 is passed explicitly while options carries top_k=1; two hits are
+    # expected, and the namespace from options must still filter them.
+    hits = query(fixture_pack, "alpha beta", options, top_k=2)
+    assert len(hits) == 2
+    assert all(hit.namespace == "docs.alpha" for hit in hits)
+
+
+def test_non_editable_wheel_uses_top_level_package_paths(tmp_path):
+    """A wheel built by the local backend stores files under knolo/, not src/knolo/."""
+    build_meta = _load_build_backend()
+    wheel_name = build_meta.build_wheel(tmp_path)
+    wheel_path = tmp_path / wheel_name
+
+    assert wheel_path.exists()
+
+    with zipfile.ZipFile(wheel_path) as wheel:
+        names = wheel.namelist()
+
+    assert "knolo/__init__.py" in names
+    assert "knolo/errors.py" in names
+    assert "knolo/models.py" in names
+    assert "knolo/runtime.py" in names
+    assert "knolo/tokenize.py" in names
+    assert "knolo/py.typed" in names
+    assert not any(name.startswith("src/knolo/") for name in names)
+
+
+@pytest.mark.parametrize(
+    "payload",
+    [
+        b"not-json-at-all",
+        struct.pack("