
Commit 4f05ed5

feat: add Brazilian Portuguese support to entity_detector
Closes #117. Extends the detector so a file written in pt-br is treated the same way a file in English is: names are extracted as candidates, and verb / pronoun / dialogue / direct-address patterns contribute to the person-vs-project classification. Pure-English corpora are unchanged because every addition is additive.

The concrete pieces:

- New PERSON_VERB_PATTERNS_PTBR, PRONOUN_PATTERNS_PTBR, and DIALOGUE_PATTERNS_PTBR constants with the Portuguese equivalents of the existing English signals (said / asked / replied / thinks / wants, plus the greetings oi / olá / obrigado / caro).
- _build_patterns concatenates the English and pt-br lists for the dialogue and person-verb buckets, so _every_ compiled matcher for an entity now covers both languages.
- score_entity merges the English and pt-br pronoun lists for the proximity check.
- extract_candidates widens its character class to the Latin-1 supplement so accented names like João, Inês, Ângela, and André flow through candidate extraction instead of being silently dropped by an ASCII-only regex.
- STOPWORDS gains the Portuguese greeting fillers (oi, olá, obrigado, obrigada, caro, cara) so they do not masquerade as entity candidates when they start sentences.

The additions to tests/test_entity_detector.py cover the English regression, pt-br person verbs (with a direct _build_patterns assertion so the signal source is unambiguous), pt-br pronoun proximity, direct address, a mixed-language corpus compared against English-only, Portuguese dialogue markers in quoted speech, and end-to-end detect_entities runs for both ASCII (Maria) and accented (João, Inês) names.
1 parent 0fdd086 commit 4f05ed5

2 files changed

Lines changed: 191 additions & 9 deletions
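Concretely, the additive design plays out like this. A minimal sketch (score_entity's signature and the "person_score" key are taken from the diff and tests below; the corpora are the ones the new tests use):

```python
from mempalace.entity_detector import score_entity

english = (
    "Maria said hello to the team.\n"
    "Maria asked about the release.\n"
    "Maria decided to approve it.\n"
)
portuguese = (
    "Maria disse oi para o time.\n"
    "Maria perguntou sobre o lançamento.\n"
    "Maria decidiu aprovar.\n"
)

# Both corpora should yield a positive person score: the pt-br patterns are
# layered on top of the English ones, so neither language displaces the other.
for text in (english, portuguese):
    scores = score_entity("Maria", text, text.splitlines())
    assert scores["person_score"] > 0
```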


mempalace/entity_detector.py

Lines changed: 65 additions & 9 deletions
@@ -68,6 +68,47 @@
     r'"{name}\s+said',
 ]
 
+# Brazilian Portuguese (pt-br) localisations — issue #117.
+# Additive on top of the English patterns so detection still works on mixed
+# English / Portuguese corpora without having to classify files by language.
+PERSON_VERB_PATTERNS_PTBR = [
+    r"\b{name}\s+disse\b",  # said
+    r"\b{name}\s+perguntou\b",  # asked
+    r"\b{name}\s+respondeu\b",  # replied
+    r"\b{name}\s+contou\b",  # told
+    r"\b{name}\s+riu\b",  # laughed
+    r"\b{name}\s+sorriu\b",  # smiled
+    r"\b{name}\s+chorou\b",  # cried
+    r"\b{name}\s+sentiu\b",  # felt
+    r"\b{name}\s+pensa\b",  # thinks
+    r"\b{name}\s+quer\b",  # wants
+    r"\b{name}\s+ama\b",  # loves
+    r"\b{name}\s+odeia\b",  # hates
+    r"\b{name}\s+sabe\b",  # knows
+    r"\b{name}\s+decidiu\b",  # decided
+    r"\b{name}\s+escreveu\b",  # wrote
+    r"\boi\s+{name}\b",  # hi
+    r"\bol[áa]\s+{name}\b",  # hello
+    r"\bobrigad[oa]\s+{name}\b",  # thanks
+    r"\bcaro\s+{name}\b",  # dear
+    r"\bcara\s+{name}\b",  # dear (feminine)
+]
+
+PRONOUN_PATTERNS_PTBR = [
+    r"\bela\b",
+    r"\bdela\b",
+    r"\bele\b",
+    r"\bdele\b",
+    r"\beles\b",
+    r"\belas\b",
+    r"\bdeles\b",
+    r"\bdelas\b",
+]
+
+DIALOGUE_PATTERNS_PTBR = [
+    r'"{name}\s+disse',
+]
+
 # Project signals — things projects have/do
 PROJECT_VERB_PATTERNS = [
     r"\bbuilding\s+{name}\b",
@@ -319,6 +360,14 @@
     "right",
     "let",
     "ok",
+    # Brazilian Portuguese greetings and filler words
+    "oi",
+    "ola",
+    "olá",
+    "obrigado",
+    "obrigada",
+    "caro",
+    "cara",
     # UI/action words that appear in how-to content
     "click",
     "hit",
@@ -445,16 +494,18 @@ def extract_candidates(text: str) -> dict:
     Extract all capitalized proper noun candidates from text.
     Returns {name: frequency} for names appearing 3+ times.
     """
-    # Find all capitalized words (not at sentence start — harder, so we use frequency as filter)
-    raw = re.findall(r"\b([A-Z][a-z]{1,19})\b", text)
+    # Find all capitalized words (not at sentence start — harder, so we use frequency as filter).
+    # The character classes include Latin-1 supplement so accented names like
+    # João, Inês, Ângela, and André are picked up alongside ASCII names.
+    raw = re.findall(r"\b([A-ZÀ-ÖØ-Þ][a-zà-öø-ÿ]{1,19})\b", text)
 
     counts = defaultdict(int)
     for word in raw:
         if word.lower() not in STOPWORDS and len(word) > 1:
             counts[word] += 1
 
     # Also find multi-word proper nouns (e.g. "Memory Palace", "Claude Code")
-    multi = re.findall(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b", text)
+    multi = re.findall(r"\b([A-ZÀ-ÖØ-Þ][a-zà-öø-ÿ]+(?:\s+[A-ZÀ-ÖØ-Þ][a-zà-öø-ÿ]+)+)\b", text)
     for phrase in multi:
         if not any(w.lower() in STOPWORDS for w in phrase.split()):
             counts[phrase] += 1
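The effect of the widened class, isolated (both regexes copied from the hunk above): "ã" is a word character but falls outside [a-z], so the ASCII-only pattern can never satisfy its trailing \b inside an accented name and drops it entirely.

```python
import re

text = "João escreveu o relatório."

# ASCII-only class: no match at all, "João" is silently dropped.
assert re.findall(r"\b([A-Z][a-z]{1,19})\b", text) == []

# Latin-1-aware class: the accented name is captured intact.
assert re.findall(r"\b([A-ZÀ-ÖØ-Þ][a-zà-öø-ÿ]{1,19})\b", text) == ["João"]
```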
@@ -469,15 +520,19 @@ def extract_candidates(text: str) -> dict:
 def _build_patterns(name: str) -> dict:
     """Pre-compile all regex patterns for a single entity name."""
     n = re.escape(name)
+    dialogue = DIALOGUE_PATTERNS + DIALOGUE_PATTERNS_PTBR
+    person_verbs = PERSON_VERB_PATTERNS + PERSON_VERB_PATTERNS_PTBR
     return {
-        "dialogue": [
-            re.compile(p.format(name=n), re.MULTILINE | re.IGNORECASE) for p in DIALOGUE_PATTERNS
-        ],
-        "person_verbs": [re.compile(p.format(name=n), re.IGNORECASE) for p in PERSON_VERB_PATTERNS],
+        "dialogue": [re.compile(p.format(name=n), re.MULTILINE | re.IGNORECASE) for p in dialogue],
+        "person_verbs": [re.compile(p.format(name=n), re.IGNORECASE) for p in person_verbs],
         "project_verbs": [
             re.compile(p.format(name=n), re.IGNORECASE) for p in PROJECT_VERB_PATTERNS
         ],
-        "direct": re.compile(rf"\bhey\s+{n}\b|\bthanks?\s+{n}\b|\bhi\s+{n}\b", re.IGNORECASE),
+        "direct": re.compile(
+            rf"\bhey\s+{n}\b|\bthanks?\s+{n}\b|\bhi\s+{n}\b"
+            rf"|\boi\s+{n}\b|\bol[áa]\s+{n}\b|\bobrigad[oa]\s+{n}\b",
+            re.IGNORECASE,
+        ),
         "versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE),
         "code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
     }
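A usage sketch mirroring the new tests: one compiled pattern set per entity now matches signals in either language.

```python
from mempalace.entity_detector import _build_patterns

patterns = _build_patterns("Maria")

# The merged person-verb bucket covers the pt-br signals...
assert any(rx.search("Maria disse oi") for rx in patterns["person_verbs"])
assert any(rx.search("Maria perguntou algo") for rx in patterns["person_verbs"])

# ...and the widened "direct" matcher accepts greetings in both languages.
assert patterns["direct"].search("hi Maria")
assert patterns["direct"].search("olá Maria, tudo bem?")
```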
@@ -514,9 +569,10 @@ def score_entity(name: str, text: str, lines: list) -> dict:
     name_lower = name.lower()
     name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
     pronoun_hits = 0
+    all_pronoun_patterns = PRONOUN_PATTERNS + PRONOUN_PATTERNS_PTBR
     for idx in name_line_indices:
         window_text = " ".join(lines[max(0, idx - 2) : idx + 3]).lower()
-        for pronoun_pattern in PRONOUN_PATTERNS:
+        for pronoun_pattern in all_pronoun_patterns:
             if re.search(pronoun_pattern, window_text):
                 pronoun_hits += 1
                 break
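The proximity check walks a five-line window (two lines either side of each line mentioning the name) and counts at most one pronoun hit per mention. A self-contained sketch of that window logic, with an inlined excerpt of the merged pronoun list (the English entries here are assumed; the pt-br ones come from the new constant):

```python
import re

# Excerpt of PRONOUN_PATTERNS + PRONOUN_PATTERNS_PTBR.
pronouns = [r"\bshe\b", r"\bhe\b", r"\bela\b", r"\bele\b", r"\bdela\b"]

lines = [
    "Maria estava no escritório hoje.",
    "Ela pediu café e começou a revisar o código.",
]

hits = 0
for idx, line in enumerate(lines):
    if "maria" not in line.lower():
        continue  # only windows around mentions of the name are scanned
    window = " ".join(lines[max(0, idx - 2) : idx + 3]).lower()
    if any(re.search(p, window) for p in pronouns):
        hits += 1  # at most one hit per mention, as in score_entity

assert hits == 1
```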

tests/test_entity_detector.py

Lines changed: 126 additions & 0 deletions
@@ -1,11 +1,15 @@
 """Tests for mempalace.entity_detector."""
 
 import os
+import shutil
+import tempfile
+from pathlib import Path
 from unittest.mock import patch
 
 from mempalace.entity_detector import (
     PROSE_EXTENSIONS,
     STOPWORDS,
+    _build_patterns,
     _print_entity_list,
     classify_entity,
     confirm_entities,
@@ -378,3 +382,125 @@ def test_scan_for_detection_max_files(tmp_path):
         (tmp_path / f"note{i}.md").write_text(f"content {i}")
     files = scan_for_detection(str(tmp_path), max_files=5)
     assert len(files) <= 5
+
+
+# ── Brazilian Portuguese (pt-br) tests — PR #156 ─────────────────────────────
+
+
+def test_scores_english_person_verbs():
+    text = (
+        "Maria said hello to the team.\n"
+        "Maria asked about the release.\n"
+        "Maria decided to approve it.\n"
+        "Maria wrote the spec. Maria wrote the email.\n"
+    )
+    scores = score_entity("Maria", text, text.splitlines())
+    assert scores["person_score"] > 0
+    assert any("action" in s for s in scores["person_signals"])
+
+
+def test_scores_portuguese_person_verbs():
+    text = (
+        "Maria disse oi para o time.\n"
+        "Maria perguntou sobre o lançamento.\n"
+        "Maria decidiu aprovar.\n"
+        "Depois Maria escreveu a spec. Maria escreveu o email.\n"
+    )
+    scores = score_entity("Maria", text, text.splitlines())
+    assert scores["person_score"] > 0
+    assert any("action" in s for s in scores["person_signals"])
+
+    patterns = _build_patterns("Maria")
+    assert any(rx.search("Maria disse oi") for rx in patterns["person_verbs"])
+    assert any(rx.search("Maria perguntou algo") for rx in patterns["person_verbs"])
+    assert any(rx.search("Maria decidiu aprovar") for rx in patterns["person_verbs"])
+
+
+def test_portuguese_pronoun_proximity():
+    text = (
+        "Maria estava no escritório hoje.\n"
+        "Ela pediu café e começou a revisar o código.\n"
+        "Dela veio a sugestão de migrar para Postgres.\n"
+        "Maria estava no escritório hoje.\n"
+        "Ela pediu café e começou a revisar o código.\n"
+        "Dela veio a sugestão de migrar para Postgres.\n"
+    )
+    scores = score_entity("Maria", text, text.splitlines())
+    assert any("pronoun" in s for s in scores["person_signals"])
+
+
+def test_portuguese_direct_address():
+    text = "oi Maria, tudo bem? obrigado Maria. olá Maria, chegou o relatório."
+    patterns = _build_patterns("Maria")
+    assert len(patterns["direct"].findall(text)) == 3
+
+    scores = score_entity("Maria", text, text.splitlines())
+    assert scores["person_score"] >= 12
+
+
+def test_mixed_english_portuguese_corpus():
+    english_only = (
+        "Maria said hello.\n"
+        "Maria asked about the release.\n"
+        "Maria decided to ship.\n"
+        "Maria wrote the note.\n"
+        "Maria wrote the note.\n"
+    )
+    mixed = (
+        "Maria said hello. Maria disse oi.\n"
+        "Maria asked about the release. Maria perguntou sobre o lançamento.\n"
+        "Maria decided to ship. Maria decidiu entregar.\n"
+        "Maria wrote the note. Maria escreveu a nota.\n"
+        "Maria wrote the note. Maria escreveu a nota.\n"
+    )
+    english_scores = score_entity("Maria", english_only, english_only.splitlines())
+    mixed_scores = score_entity("Maria", mixed, mixed.splitlines())
+    assert mixed_scores["person_score"] > english_scores["person_score"]
+
+
+def test_portuguese_dialogue_marker_in_quoted_text():
+    text = '"Maria disse que o deploy rodou bem."\n"Maria disse que está tudo OK."\n'
+    scores = score_entity("Maria", text, text.splitlines())
+    assert any("dialogue" in s for s in scores["person_signals"])
+
+
+def test_detect_entities_finds_portuguese_person():
+    text = (
+        "Maria disse que o deploy foi bem.\n"
+        "Depois Maria perguntou sobre o backend.\n"
+        "Maria decidiu aprovar a migração.\n"
+        "Maria escreveu a documentação final.\n"
+        "Ela é a nova líder do time.\n"
+    )
+    tmpdir = tempfile.mkdtemp()
+    try:
+        file_path = Path(tmpdir) / "reuniao.md"
+        file_path.write_text(text, encoding="utf-8")
+        detected = detect_entities([file_path])
+        all_names = [e["name"] for e in detected["people"] + detected["uncertain"]]
+        assert "Maria" in all_names
+    finally:
+        shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+def test_detect_entities_picks_up_accented_names():
+    text = (
+        "João é o líder do time.\n"
+        "João disse que o lançamento foi bem.\n"
+        "João perguntou sobre o backend novo.\n"
+        "Depois João escreveu o relatório.\n"
+        "Inês entrou no projeto semana passada.\n"
+        "Inês disse que prefere Postgres.\n"
+        "Inês escreveu a documentação da API.\n"
+        "Inês decidiu aprovar a migração.\n"
+    )
+    tmpdir = tempfile.mkdtemp()
+    try:
+        file_path = Path(tmpdir) / "reuniao.md"
+        file_path.write_text(text, encoding="utf-8")
+        detected = detect_entities([file_path])
+        all_names = [e["name"] for e in detected["people"] + detected["uncertain"]]
+        assert "João" in all_names
+        assert "Inês" in all_names
+    finally:
+        shutil.rmtree(tmpdir, ignore_errors=True)
