diff --git a/CLAUDE.md b/CLAUDE.md index 7f5b7d8..d546447 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -43,15 +43,18 @@ pre-commit run --all-files ## CI -GitHub Actions runs on push to `main` and all PRs. Five parallel jobs: +GitHub Actions runs on push to `main` and all PRs. Four parallel required checks, then downstream gates: - **lint** — `ruff check` + `ruff format --check` - **typecheck** — `mypy --strict src/navi_sanitize/` -- **test** — pytest across Python 3.12 + 3.13, `--benchmark-disable` +- **test** — pytest across Python 3.12 + 3.13, `--benchmark-disable` (matrix → aggregator) - **security** — `pip-audit` dependency vulnerability scan -- **build** — gates on all four above; builds wheel, smoke-tests public API, uploads artifact +- **quality-gate** — gates on all four above (org ruleset required check) +- **build** — gates on all four required checks; builds wheel, smoke-tests imports (`clean`, `walk`, `jinja2_escaper`, `path_escaper`), uploads artifact -Additional security workflows: Semgrep SAST, CodeQL (`python` + `actions`), OpenSSF Scorecard. +Additional security workflows: Semgrep SAST, CodeQL (`python` + `actions` via org GHAS), OpenSSF Scorecard. + +**Fuzz testing** (`.github/workflows/fuzz.yml`) — Atheris fuzzing of `fuzz_clean` and `fuzz_walk` targets, runs on push/PR and weekly schedule (Wednesday 03:00 UTC). Uploads crash artifacts on failure. Benchmarks run via manual dispatch only (`.github/workflows/benchmark.yml`). @@ -68,17 +71,17 @@ Eight exports: `clean(text, *, escaper=None) -> str`, `walk(data, *, escaper=Non Six stages in strict order — reordering breaks security: 1. **Null byte removal** — strip `\x00` (prevents C-extension truncation) -2. **Invisible character stripping** — single compiled regex covering zero-width chars, format/control chars, variation selectors, Unicode Tag block (`U+E0000`-`U+E007F`), and bidi overrides +2. 
**Invisible character stripping** — single compiled regex covering 492 chars across 9 categories: zero-width, format/control, variation selectors, variation selector supplement, Mongolian FVS, Unicode Tag block (`U+E0000`-`U+E007F`), bidirectional controls, C0 controls, and C1 controls 3. **NFKC normalization** — collapses fullwidth ASCII and compatibility forms 4. **Homoglyph replacement** — NFD decomposition then character-by-character scan against 66-pair map in `_homoglyphs.py` 5. **Re-NFKC** (conditional) — re-normalize after homoglyph replacement to ensure idempotency 6. **Escaper** (optional) — pluggable `Callable[[str], str]` runs last -Each stage returns `(cleaned_string, count_or_flag)` — either an `int` count of removals/replacements or a `bool` changed flag. Stages have no side effects — the orchestrator logs. +Stages 1–5 each return `(cleaned_string, count)` where `count` is an `int` for removals, replacements, or normalization changes. Stage 6 (escaper) is a `Callable[[str], str]` that returns a bare `str`. Stages have no side effects — the orchestrator logs. 
### Data files -- `_homoglyphs.py` — 66 pairs: Cyrillic, Greek, Armenian, Cherokee, and typographic lookalikes +- `_homoglyphs.py` — 66 pairs: Cyrillic, Greek, Armenian, Cherokee, Cyrillic Extended, Latin Extended, and typographic lookalikes - `_invisible.py` — zero-width, format/control (soft hyphen, thin/hair space, line/paragraph separators, etc.), variation selectors, variation selector supplement, Mongolian FVS, Unicode Tag block, bidirectional controls, C0 controls, and C1 controls ### Escapers (`escapers/`) diff --git a/README.md b/README.md index 6b8aae2..b51100a 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,11 @@ [![Python 3.12+](https://img.shields.io/badge/python-3.12%2B-blue.svg)](https://www.python.org/downloads/) [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) -Deterministic input sanitization for untrusted text. Zero dependencies. Legitimate Unicode preserved by design. +Deterministic input sanitization for untrusted text — invisible characters, homoglyphs, and encoding tricks, handled before your code sees them. Zero dependencies, no ML. Legitimate Unicode preserved by design. + +``` +pip install navi-sanitize +``` **[Documentation](https://project-navi.github.io/navi-sanitize/)** · [Getting Started](https://project-navi.github.io/navi-sanitize/getting-started/quickstart/) · [API Reference](https://project-navi.github.io/navi-sanitize/reference/api/) · [Threat Model](https://project-navi.github.io/navi-sanitize/explanation/threat-model/) @@ -34,19 +38,21 @@ Opt-in utilities for deeper analysis: `decode_evasion()` peels nested URL/HTML/h ## Why This Matters -Untrusted text contains invisible attacks: homoglyph substitution, zero-width characters, null bytes, fullwidth encoding, template/prompt injection delimiters. These bypass validation, poison templates, and fool humans. 
+Untrusted text contains invisible attacks: homoglyph substitution, zero-width characters, null bytes, fullwidth encoding, template/prompt injection delimiters. These bypass validation, poison templates, and fool humans. Framework validators handle format and type — they don't handle Unicode deception. That's what this library is for. + +navi-sanitize fixes the text before it reaches your application. It doesn't detect attacks — it removes them. Implements the NFKC + zero-width + control character pipeline recommended by the [OWASP LLM Prompt Injection Prevention Cheat Sheet](https://cheatsheetseries.owasp.org/cheatsheets/LLM_Prompt_Injection_Prevention_Cheat_Sheet.html). -navi-sanitize fixes the text before it reaches your application. It doesn't detect attacks — it removes them. +**LLM prompt pipelines** — Character-level attacks bypass LLM guardrails at [64–67% success rates](https://arxiv.org/html/2504.11168v1). Invisible Unicode encodes instructions tokenizers read but humans can't see. Homoglyphs bypass keyword filters. Sanitize before the model sees it. -**LLM prompt pipelines** — User input flows into system prompts, RAG context, and tool calls. Invisible Unicode (tag block characters, bidi overrides) encodes instructions that tokenizers read but humans can't see. Homoglyphs bypass keyword filters. navi-sanitize strips these vectors before text reaches the model, and the pluggable escaper lets you add vendor-specific prompt escaping on top. +**Web applications** — A single `clean(user_input, escaper=jinja2_escaper)` call handles homoglyph-disguised SSTI payloads like `{{ cоnfig }}` (Cyrillic `о`) that naive escaping misses. -**Web applications** — Jinja2 SSTI, path traversal, and fullwidth encoding bypasses are well-known but tedious to cover manually. A single `clean(user_input, escaper=jinja2_escaper)` call handles homoglyph-disguised payloads like `{{ cоnfig }}` (Cyrillic `о`) that naive escaping misses. 
+**Identity and anti-phishing** — `pаypal.com` (Cyrillic `а`) renders identically to `paypal.com`. navi-sanitize is the only maintained Python homoglyph replacement library — both [confusable_homoglyphs](https://github.com/vhf/confusable_homoglyphs) and [homoglyphs](https://github.com/life4/homoglyphs) are archived. -**Identity and anti-phishing** — `pаypal.com` (Cyrillic `а`) renders identically to `paypal.com` in most fonts. Homoglyph replacement normalizes display names, URLs, and email addresses to catch spoofing that visual inspection misses. +**Log analysis** — Bidi overrides and zero-width chars hide indicators of compromise (IOCs) from analysts. Sanitize on ingest so search matches reality. -**Log analysis and SIEM** — Attackers embed bidi overrides and zero-width characters in log entries to hide indicators of compromise from analysts and pattern-matching tools. Sanitizing log data on ingest ensures what you search is what's actually there. +**Config ingestion** — Null bytes truncate C-extension processing, and zero-width chars break key matching. `walk(parsed_config)` sanitizes every string in a nested structure in one call. -**Config and data ingestion** — YAML, TOML, and JSON parsed from untrusted sources can carry null bytes that truncate C-extension processing, zero-width characters that break key matching, and homoglyphs that create near-duplicate keys. `walk(parsed_config)` sanitizes every string in a nested structure in one call. +These aren't theoretical risks — [CVE-2024-43093](https://nvd.nist.gov/vuln/detail/CVE-2024-43093) was an actively exploited Android zero-day using the exact fullwidth character bypass this pipeline prevents.
## How It Compares @@ -59,10 +65,12 @@ navi-sanitize is the only library that combines invisible character stripping, h | **Homoglyphs** | Replaces 66 curated pairs | Transliterates all non-ASCII | Detects only (no replace) | No | No | | **NFKC** | Yes | No | No | NFC (NFKC optional) | No | | **Null bytes** | Yes | No | No | No | No | -| **Preserves Unicode** | Yes (CJK, Arabic, emoji intact) | No (destroys all non-ASCII) | Yes | Yes | Yes | +| **Preserves Unicode** | Yes (CJK, Arabic, emoji¹ intact) | No (destroys all non-ASCII) | Yes | Yes | Yes | | **Pluggable escaper** | Yes | No | No | No | N/A (HTML-specific) | | **Dependencies** | Zero | Zero | Zero | wcwidth | C ext / Rust ext | +¹ ZWJ (U+200D) is stripped as a zero-width character, which decomposes ZWJ emoji sequences (e.g. family emoji) into individual emoji. Single emoji are unaffected. Bidi formatting marks (U+061C, U+200E/F, etc.) used in Arabic/Hebrew are also stripped — correct rendering may require re-adding directional marks downstream. + **Key differences:** - **Unidecode / anyascii** transliterate *all* non-ASCII to Latin. They turn `"中"` into `"Zhong"` and Cyrillic sentences into gibberish. navi-sanitize normalizes only the 66 highest-risk lookalikes and leaves legitimate Unicode intact. @@ -78,9 +86,9 @@ Every string passes through stages in order. 
Each stage returns clean output and | Stage | What it does | |-------|-------------| | Null bytes | Strip `\x00` | -| Invisibles | Strip zero-width, Unicode Tag block, bidi controls | +| Invisibles | Strip zero-width, format/control, variation selectors, Unicode Tag block, bidi, C0/C1 | | NFKC | Normalize fullwidth ASCII to standard ASCII | -| Homoglyphs | Replace Cyrillic/Greek lookalikes with Latin equivalents | +| Homoglyphs | Replace Cyrillic/Greek/Armenian/Cherokee/typographic lookalikes with Latin equivalents | | Re-NFKC | Re-normalize after homoglyph replacement (ensures idempotency) | | **Escaper** | Pluggable — you choose what to escape for | @@ -145,12 +153,6 @@ template.render(**safe_context) See [examples/](examples/) for runnable scripts covering LLM pipelines, FastAPI/Pydantic, and log sanitization. -## Install - -``` -pip install navi-sanitize -``` - ## Walk untrusted data structures ```python @@ -215,17 +217,17 @@ clean("pаypal.com") ## Performance -Measured on Python 3.12, single thread. `clean()` is the per-string cost; `walk()` includes the iterative copy pass. +Measured on Python 3.13, single thread, AMD Ryzen 9 9950X. `clean()` is the per-string cost; `walk()` includes the iterative copy pass. Numbers are representative — expect roughly ±20% on comparable desktop hardware; shared CI runners are typically 2–3x slower. 
| Scenario | Mean | Ops/sec | |----------|------|---------| -| `clean()` — short, clean text (no-op) | 2.8 us | 358K | -| `clean()` — short, hostile (all stages fire) | 67 us | 15K | -| `clean()` — 13KB clean text | 810 us | 1.2K | -| `clean()` — 10KB hostile text | 449 us | 2.2K | -| `clean()` — 100KB hostile payload | 5.7 ms | 176 | -| `walk()` — 100-item nested dict, clean | 537 us | 1.9K | -| `walk()` — 100-item nested dict, hostile | 6.9 ms | 144 | +| `clean()` — short, clean text (no-op) | 1.1 µs | 905K | +| `clean()` — short, hostile (all stages fire) | 21 µs | 48K | +| `clean()` — 13KB clean text | 292 µs | 3.4K | +| `clean()` — 10KB hostile text | 305 µs | 3.3K | +| `clean()` — 100KB hostile payload | 3.5 ms | 286 | +| `walk()` — 100-item nested dict, clean | 311 µs | 3.2K | +| `walk()` — 100-item nested dict, hostile | 2.5 ms | 408 | ## License diff --git a/docs/explanation/comparison.md b/docs/explanation/comparison.md index 163696e..92c23cc 100644 --- a/docs/explanation/comparison.md +++ b/docs/explanation/comparison.md @@ -23,7 +23,9 @@ navi-sanitize is the only library that combines invisible character stripping, h **Why they're different:** Transliteration destroys content. Unidecode turns Chinese characters into pinyin, Cyrillic sentences into romanized gibberish, and Arabic into Latin approximations. It's designed for slug generation, not security. -navi-sanitize normalizes only the 66 highest-risk Latin lookalikes and leaves legitimate Unicode intact. CJK, Arabic, emoji, and non-confusable Cyrillic pass through unchanged. +navi-sanitize normalizes only the 66 highest-risk Latin lookalikes and leaves legitimate Unicode intact. CJK, Arabic, emoji,¹ and non-confusable Cyrillic pass through unchanged. + +¹ ZWJ (U+200D) is stripped as a zero-width character, which decomposes ZWJ emoji sequences (e.g. family emoji) into individual emoji. Single emoji are unaffected. 
| Input | navi-sanitize | Unidecode | |-------|--------------|-----------| @@ -118,7 +120,7 @@ class UserInput(BaseModel): The Unicode Consortium's `Confusables.txt` contains thousands of pairs across many scripts. navi-sanitize uses a curated 66-pair subset focused on: 1. **Highest visual similarity** --- characters that are pixel-identical in common fonts -2. **Most commonly weaponized** --- Cyrillic/Greek-to-Latin pairs used in phishing and filter bypass +2. **Most commonly weaponized** --- Cyrillic/Greek/Armenian/Cherokee-to-Latin pairs used in phishing and filter bypass 3. **Typographic normalization** --- smart quotes, em/en dashes, minus signs This preserves legitimate Unicode while covering the attack surface that matters in practice. diff --git a/docs/explanation/performance.md b/docs/explanation/performance.md index 58a5af0..9801f28 100644 --- a/docs/explanation/performance.md +++ b/docs/explanation/performance.md @@ -1,6 +1,6 @@ # Performance -Benchmarks measured on Python 3.12, single thread. Run via `uv run pytest tests/test_benchmark.py -v`. +Benchmarks measured on Python 3.13, single thread, AMD Ryzen 9 9950X. Run via `uv run pytest tests/test_benchmark.py -v`. Numbers are representative --- expect roughly ±20% on comparable desktop hardware; shared CI runners are typically 2--3x slower. ## Benchmark Results @@ -8,18 +8,18 @@ Benchmarks measured on Python 3.12, single thread. 
Run via `uv run pytest tests/ | Scenario | Mean | Ops/sec | Description | |----------|------|---------|-------------| -| Short, clean text (no-op) | 2.8 us | 358K | ~38 chars, no stages fire | -| Short, hostile (all stages) | 67 us | 15K | ~27 chars with homoglyphs, null bytes, zero-width, template syntax | -| 13KB clean text | 810 us | 1.2K | Large clean input throughput | -| 10KB hostile text | 449 us | 2.2K | Large hostile input with repeated attack patterns | -| 100KB hostile payload | 5.7 ms | 176 | Stress test payload | +| Short, clean text (no-op) | 1.1 µs | 905K | ~38 chars, no stages fire | +| Short, hostile (all stages) | 21 µs | 48K | ~27 chars with homoglyphs, null bytes, zero-width, template syntax | +| 13KB clean text | 292 µs | 3.4K | Large clean input throughput | +| 10KB hostile text | 305 µs | 3.3K | Large hostile input with repeated attack patterns | +| 100KB hostile payload | 3.5 ms | 286 | Stress test payload | ### `walk()` --- Recursive Structure Cost | Scenario | Mean | Ops/sec | Description | |----------|------|---------|-------------| -| 100-item nested dict, clean | 537 us | 1.9K | `deepcopy` + traversal overhead, no stages fire | -| 100-item nested dict, hostile | 6.9 ms | 144 | `deepcopy` + full pipeline on every string | +| 100-item nested dict, clean | 311 µs | 3.2K | Iterative copy + traversal overhead, no stages fire | +| 100-item nested dict, hostile | 2.5 ms | 408 | Iterative copy + full pipeline on every string | ## When to Use `clean()` vs `walk()` @@ -31,7 +31,7 @@ Benchmarks measured on Python 3.12, single thread. Run via `uv run pytest tests/ | Nested config from untrusted source | `walk()` | | Hot path, single known string | `clean()` | -`walk()` adds `deepcopy` overhead to ensure the original data is never modified. If you're already working with a copy or don't need immutability, you can call `clean()` on individual strings for better performance. 
+`walk()` adds iterative copy overhead to ensure the original data is never modified. If you're already working with a copy or don't need immutability, you can call `clean()` on individual strings for better performance. ## Performance Characteristics by Stage diff --git a/docs/explanation/pipeline-architecture.md b/docs/explanation/pipeline-architecture.md index 184cc2d..18b2fd7 100644 --- a/docs/explanation/pipeline-architecture.md +++ b/docs/explanation/pipeline-architecture.md @@ -1,6 +1,6 @@ # Pipeline Architecture -Every string passed to `clean()` flows through six stages in strict order. Each stage is a deterministic function that returns the cleaned string and a change indicator (a count for stages that strip or replace, a boolean for normalization). The pipeline orchestrator logs warnings when stages modify input. +Every string passed to `clean()` flows through six stages in strict order. Each of the five universal stages (1--5) is a deterministic function that returns the cleaned string and a change indicator (a count of affected codepoints). The escaper (stage 6, if provided) is a plain `str -> str` function. The pipeline orchestrator logs warnings when stages modify input. ## Data Flow @@ -24,7 +24,7 @@ Input string │ ▼ ┌─────────────────────┐ -│ 4. Homoglyph Replace│ Cyrillic/Greek → Latin +│ 4. Homoglyph Replace│ Cyrillic/Greek/Armenian/Cherokee/typographic → Latin └─────────┬───────────┘ │ ▼ diff --git a/docs/explanation/threat-model.md b/docs/explanation/threat-model.md index 3e8c14a..f735402 100644 --- a/docs/explanation/threat-model.md +++ b/docs/explanation/threat-model.md @@ -5,7 +5,9 @@ navi-sanitize is a deterministic text sanitization library. It transforms untrus ## Design Philosophy 1. **Deterministic** --- same input, same output, every time. No ML models, no heuristics, no confidence scores. -2. **Legitimate Unicode preserved** --- CJK, Arabic, Hebrew, emoji, and non-confusable text pass through unchanged. 
A string that passes through unmodified was already clean. +2. **Legitimate Unicode preserved** --- CJK, Arabic, Hebrew, emoji,¹ and non-confusable text pass through unchanged. A string that passes through unmodified was already clean. + +¹ ZWJ (U+200D) is stripped as a zero-width character, decomposing ZWJ emoji sequences into individual emoji. Bidi formatting marks (U+061C, U+200E/F, etc.) are also stripped --- see [Stripping Arabic Letter Mark](#stripping-arabic-letter-mark-and-mongolian-fvs) below. 3. **Always returns output** --- never throws on bad input (except `TypeError` for non-strings). Attackers can't cause denial of service by crafting inputs that error. 4. **Pluggable** --- the universal pipeline handles common vectors; escapers handle context-specific threats. @@ -117,7 +119,7 @@ re-inserts them from a trusted source. Characters like ᴀᴅᴍɪɴ (Latin Small Capitals, U+1D00--U+1D22) and ɑ (Latin Small Letter Alpha, U+0251) are visually similar to standard Latin letters but are classified as Latin script by -Unicode. The homoglyph map targets cross-script confusables (Cyrillic, Greek, Armenian, Cherokee) +Unicode. Apart from its Latin Extended and typographic entries, the homoglyph map targets cross-script confusables (Cyrillic, Greek, Armenian, Cherokee) where the script mismatch is the attack signal. Latin-to-Latin visual similarity is a different threat model better served by `detect_scripts()` and `is_mixed_script()` --- or by application-level character allowlisting for high-security contexts like username registration. 
diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md index 3816346..571d49e 100644 --- a/docs/getting-started/quickstart.md +++ b/docs/getting-started/quickstart.md @@ -114,7 +114,7 @@ clean("pаypal.com") Warnings include counts for traceability: - `"Removed 2 null byte(s) from value"` - `"Stripped 3 invisible character(s) from value"` -- `"Normalized fullwidth character(s) in value"` +- `"Normalized 1 fullwidth/compatibility character(s) in value"` - `"Replaced 1 homoglyph(s) in value"` To capture warnings programmatically: diff --git a/docs/index.md b/docs/index.md index 48c9aa5..5a2680a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,9 +6,9 @@ hide: # navi-sanitize -**Deterministic input sanitization for untrusted text.** Zero dependencies. No ML. Legitimate Unicode preserved by design. +**Deterministic input sanitization for untrusted text** --- invisible characters, homoglyphs, and encoding tricks, handled before your code sees them. Zero dependencies, no ML. Legitimate Unicode preserved by design. -navi-sanitize removes invisible attacks from untrusted text before it reaches your application. It doesn't detect attacks --- it removes them. Every input produces clean output, every time. +navi-sanitize removes invisible attacks from untrusted text before it reaches your application. It doesn't detect attacks --- it removes them. Implements the pipeline recommended by the [OWASP LLM Prompt Injection Prevention Cheat Sheet](https://cheatsheetseries.owasp.org/cheatsheets/LLM_Prompt_Injection_Prevention_Cheat_Sheet.html). 
[Get Started](getting-started/quickstart.md){ .md-button .md-button--primary } [API Reference](reference/api.md){ .md-button } @@ -28,8 +28,10 @@ clean(evil) # "systemprompt" — hidden chars stripped ## Features - **6-stage pipeline** --- null bytes, invisible characters, NFKC normalization, homoglyph replacement, re-NFKC for idempotency, pluggable escaper +- **OWASP aligned** --- implements the NFKC + zero-width + control character sanitization recommended by the LLM Prompt Injection Prevention Cheat Sheet +- **Only maintained option** --- both [confusable_homoglyphs](https://github.com/vhf/confusable_homoglyphs) and [homoglyphs](https://github.com/life4/homoglyphs) are archived; navi-sanitize is the only maintained Python library covering homoglyph replacement - **Deterministic** --- same input always produces the same output; no probabilistic models, no heuristics -- **Zero dependencies** --- Python 3.12+ stdlib only +- **Zero dependencies** --- Python 3.12+ stdlib only; no third-party dependency risk - **Pluggable escapers** --- built-in Jinja2 and path traversal escapers; write your own in three lines - **Recursive sanitization** --- `walk()` sanitizes every string in nested dicts and lists - **Transparent logging** --- warnings include counts ("Stripped 3 invisible character(s)") diff --git a/src/navi_sanitize/_pipeline.py b/src/navi_sanitize/_pipeline.py index d46e8c6..535862a 100644 --- a/src/navi_sanitize/_pipeline.py +++ b/src/navi_sanitize/_pipeline.py @@ -15,6 +15,7 @@ import logging import unicodedata from collections.abc import Callable +from typing import cast from navi_sanitize._homoglyphs import HOMOGLYPH_MAP from navi_sanitize._invisible import INVISIBLE_RE @@ -40,10 +41,23 @@ def _strip_invisible(s: str) -> tuple[str, int]: return s, 0 -def _normalize_nfkc(s: str) -> tuple[str, bool]: - """NFKC normalize. Returns (cleaned, changed).""" +def _normalize_nfkc(s: str) -> tuple[str, int]: + """NFKC normalize. 
Returns (cleaned, approximate count of affected codepoints). + + Count is derived from whole-string comparison: positions that differ + between original and normalized (up to shorter length) plus any length + change from composition or decomposition. This avoids the per-character + normalization pitfall where combining-mark sequences (e.g. e + combining + acute) compose to a single precomposed codepoint but each constituent + appears unchanged when normalized in isolation. + """ normalized = unicodedata.normalize("NFKC", s) - return normalized, normalized != s + if normalized == s: + return s, 0 + min_len = min(len(s), len(normalized)) + n = sum(1 for i in range(min_len) if s[i] != normalized[i]) + n += abs(len(s) - len(normalized)) + return normalized, n def _replace_homoglyphs(s: str) -> tuple[str, int]: @@ -100,9 +114,9 @@ def clean(text: str, *, escaper: Escaper | None = None) -> str: logger.warning("Stripped %d invisible character(s) from value", invis_count) # Stage 3: NFKC normalization - text, had_nfkc = _normalize_nfkc(text) - if had_nfkc: - logger.warning("Normalized fullwidth character(s) in value") + text, nfkc_count = _normalize_nfkc(text) + if nfkc_count: + logger.warning("Normalized %d fullwidth/compatibility character(s) in value", nfkc_count) # Stage 4: Homoglyphs text, glyph_count = _replace_homoglyphs(text) @@ -143,7 +157,9 @@ def walk[T](data: T, *, escaper: Escaper | None = None, max_depth: int = 128) -> # Scalars and strings — no traversal needed if isinstance(data, str): - return clean(data, escaper=escaper) # type: ignore[return-value] + # clean() returns str; T is str (or a subclass whose extra semantics + # sanitization intentionally discards). Mypy can't prove str <: T. 
+ return cast(T, clean(data, escaper=escaper)) if not isinstance(data, (dict, list)): return data @@ -216,4 +232,6 @@ def _resolve(v: object, depth: int) -> object: for item in orig_l: copy_l.append(_resolve(item, depth)) - return result # type: ignore[return-value] + # result is the root copy (dict or list) — structurally T but typed as + # object because the stack-based builder can't carry T through. + return cast(T, result) diff --git a/tests/test_clean.py b/tests/test_clean.py index 0978a84..fbc04d5 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -84,6 +84,29 @@ def test_warns_on_fullwidth(self, caplog: pytest.LogCaptureFixture) -> None: clean("\uff54est") assert "fullwidth" in caplog.text.lower() or "normalized" in caplog.text.lower() + def test_warns_with_nfkc_count(self, caplog: pytest.LogCaptureFixture) -> None: + from navi_sanitize import clean + + with caplog.at_level(logging.WARNING, logger="navi_sanitize"): + clean("\uff41\uff42\uff43") # 3 fullwidth chars + assert "3 fullwidth/compatibility" in caplog.text + + def test_warns_with_single_nfkc_count(self, caplog: pytest.LogCaptureFixture) -> None: + from navi_sanitize import clean + + with caplog.at_level(logging.WARNING, logger="navi_sanitize"): + clean("\uff41bc") # 1 fullwidth char + assert "1 fullwidth/compatibility" in caplog.text + + def test_re_nfkc_fires_after_homoglyph_replacement(self) -> None: + """Stage 5 re-NFKC: Greek U+03A5 + combining tilde -> Y + tilde -> U+1EF8.""" + from navi_sanitize import clean + + # Greek U+03A5 is replaced with Latin Y by stage 4. + # Y + combining tilde (U+0303) then composes to U+1EF8 in stage 5. + result = clean("\u03a5\u0303") + assert result == "\u1ef8" + class TestHomoglyphReplacement: def test_replaces_cyrillic_a(self) -> None: