
Commit 4f05ed5

feat: add Brazilian Portuguese support to entity_detector
Closes #117. Extends the detector so a file written in pt-br is treated the same way a file in English is: names are extracted as candidates, and verb / pronoun / dialogue / direct-address patterns contribute to the person-vs-project classification. Pure-English corpora are unchanged because every addition is additive.

The concrete pieces:

- New PERSON_VERB_PATTERNS_PTBR, PRONOUN_PATTERNS_PTBR, and DIALOGUE_PATTERNS_PTBR constants with the Portuguese equivalents of the existing English signals (said / asked / replied / thinks / wants, plus the greetings oi / olá / obrigado / caro).
- _build_patterns concatenates the English and pt-br lists for the dialogue and person-verb buckets, so _every_ compiled matcher for an entity now covers both languages.
- score_entity merges the English and pt-br pronoun lists for the proximity check.
- extract_candidates widens its character class to the Latin-1 supplement so accented names like João, Inês, Ângela, and André flow through candidate extraction instead of being silently dropped by an ASCII-only regex.
- STOPWORDS gains the Portuguese greeting fillers (oi, olá, obrigado, obrigada, caro, cara) so they do not masquerade as entity candidates when they start sentences.

The additions to tests/test_entity_detector.py cover the English regression, pt-br person verbs (with a direct _build_patterns assertion so the signal source is unambiguous), pt-br pronoun proximity, direct address, a mixed-language corpus compared against English-only, Portuguese dialogue markers in quoted speech, and end-to-end detect_entities runs for both ASCII (Maria) and accented (João, Inês) names.
1 parent 0fdd086 commit 4f05ed5

2 files changed

Lines changed: 191 additions & 9 deletions
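Concretely, the additive design plays out like this. A minimal sketch (score_entity's signature and the "person_score" key are taken from the diff and tests below; the corpora are the ones the new tests use):

```python
from mempalace.entity_detector import score_entity

english = (
    "Maria said hello to the team.\n"
    "Maria asked about the release.\n"
    "Maria decided to approve it.\n"
)
portuguese = (
    "Maria disse oi para o time.\n"
    "Maria perguntou sobre o lançamento.\n"
    "Maria decidiu aprovar.\n"
)

# Both corpora should yield a positive person score: the pt-br patterns are
# layered on top of the English ones, so neither language displaces the other.
for text in (english, portuguese):
    scores = score_entity("Maria", text, text.splitlines())
    assert scores["person_score"] > 0
```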


mempalace/entity_detector.py

Lines changed: 65 additions & 9 deletions
@@ -68,6 +68,47 @@
     r'"{name}\s+said',
 ]
 
+# Brazilian Portuguese (pt-br) localisations — issue #117.
+# Additive on top of the English patterns so detection still works on mixed
+# English / Portuguese corpora without having to classify files by language.
+PERSON_VERB_PATTERNS_PTBR = [
+    r"\b{name}\s+disse\b",  # said
+    r"\b{name}\s+perguntou\b",  # asked
+    r"\b{name}\s+respondeu\b",  # replied
+    r"\b{name}\s+contou\b",  # told
+    r"\b{name}\s+riu\b",  # laughed
+    r"\b{name}\s+sorriu\b",  # smiled
+    r"\b{name}\s+chorou\b",  # cried
+    r"\b{name}\s+sentiu\b",  # felt
+    r"\b{name}\s+pensa\b",  # thinks
+    r"\b{name}\s+quer\b",  # wants
+    r"\b{name}\s+ama\b",  # loves
+    r"\b{name}\s+odeia\b",  # hates
+    r"\b{name}\s+sabe\b",  # knows
+    r"\b{name}\s+decidiu\b",  # decided
+    r"\b{name}\s+escreveu\b",  # wrote
+    r"\boi\s+{name}\b",  # hi
+    r"\bol[áa]\s+{name}\b",  # hello
+    r"\bobrigad[oa]\s+{name}\b",  # thanks
+    r"\bcaro\s+{name}\b",  # dear
+    r"\bcara\s+{name}\b",  # dear (feminine)
+]
+
+PRONOUN_PATTERNS_PTBR = [
+    r"\bela\b",
+    r"\bdela\b",
+    r"\bele\b",
+    r"\bdele\b",
+    r"\beles\b",
+    r"\belas\b",
+    r"\bdeles\b",
+    r"\bdelas\b",
+]
+
+DIALOGUE_PATTERNS_PTBR = [
+    r'"{name}\s+disse',
+]
+
 # Project signals — things projects have/do
 PROJECT_VERB_PATTERNS = [
     r"\bbuilding\s+{name}\b",
@@ -319,6 +360,14 @@
     "right",
     "let",
     "ok",
+    # Brazilian Portuguese greetings and filler words
+    "oi",
+    "ola",
+    "olá",
+    "obrigado",
+    "obrigada",
+    "caro",
+    "cara",
     # UI/action words that appear in how-to content
     "click",
     "hit",
@@ -445,16 +494,18 @@ def extract_candidates(text: str) -> dict:
     Extract all capitalized proper noun candidates from text.
     Returns {name: frequency} for names appearing 3+ times.
     """
-    # Find all capitalized words (not at sentence start — harder, so we use frequency as filter)
-    raw = re.findall(r"\b([A-Z][a-z]{1,19})\b", text)
+    # Find all capitalized words (not at sentence start — harder, so we use frequency as filter).
+    # The character classes include Latin-1 supplement so accented names like
+    # João, Inês, Ângela, and André are picked up alongside ASCII names.
+    raw = re.findall(r"\b([A-ZÀ-ÖØ-Þ][a-zà-öø-ÿ]{1,19})\b", text)
 
     counts = defaultdict(int)
     for word in raw:
         if word.lower() not in STOPWORDS and len(word) > 1:
             counts[word] += 1
 
     # Also find multi-word proper nouns (e.g. "Memory Palace", "Claude Code")
-    multi = re.findall(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b", text)
+    multi = re.findall(r"\b([A-ZÀ-ÖØ-Þ][a-zà-öø-ÿ]+(?:\s+[A-ZÀ-ÖØ-Þ][a-zà-öø-ÿ]+)+)\b", text)
     for phrase in multi:
         if not any(w.lower() in STOPWORDS for w in phrase.split()):
             counts[phrase] += 1
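The effect of the widened class, isolated (both regexes copied from the hunk above): "ã" is a word character but falls outside [a-z], so the ASCII-only pattern can never satisfy its trailing \b inside an accented name and drops it entirely.

```python
import re

text = "João escreveu o relatório."

# ASCII-only class: no match at all, "João" is silently dropped.
assert re.findall(r"\b([A-Z][a-z]{1,19})\b", text) == []

# Latin-1-aware class: the accented name is captured intact.
assert re.findall(r"\b([A-ZÀ-ÖØ-Þ][a-zà-öø-ÿ]{1,19})\b", text) == ["João"]
```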
@@ -469,15 +520,19 @@ def extract_candidates(text: str) -> dict:
 def _build_patterns(name: str) -> dict:
     """Pre-compile all regex patterns for a single entity name."""
     n = re.escape(name)
+    dialogue = DIALOGUE_PATTERNS + DIALOGUE_PATTERNS_PTBR
+    person_verbs = PERSON_VERB_PATTERNS + PERSON_VERB_PATTERNS_PTBR
     return {
-        "dialogue": [
-            re.compile(p.format(name=n), re.MULTILINE | re.IGNORECASE) for p in DIALOGUE_PATTERNS
-        ],
-        "person_verbs": [re.compile(p.format(name=n), re.IGNORECASE) for p in PERSON_VERB_PATTERNS],
+        "dialogue": [re.compile(p.format(name=n), re.MULTILINE | re.IGNORECASE) for p in dialogue],
+        "person_verbs": [re.compile(p.format(name=n), re.IGNORECASE) for p in person_verbs],
         "project_verbs": [
             re.compile(p.format(name=n), re.IGNORECASE) for p in PROJECT_VERB_PATTERNS
         ],
-        "direct": re.compile(rf"\bhey\s+{n}\b|\bthanks?\s+{n}\b|\bhi\s+{n}\b", re.IGNORECASE),
+        "direct": re.compile(
+            rf"\bhey\s+{n}\b|\bthanks?\s+{n}\b|\bhi\s+{n}\b"
+            rf"|\boi\s+{n}\b|\bol[áa]\s+{n}\b|\bobrigad[oa]\s+{n}\b",
+            re.IGNORECASE,
+        ),
         "versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE),
         "code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
     }
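A usage sketch mirroring the new tests: one compiled pattern set per entity now matches signals in either language.

```python
from mempalace.entity_detector import _build_patterns

patterns = _build_patterns("Maria")

# The merged person-verb bucket covers the pt-br signals...
assert any(rx.search("Maria disse oi") for rx in patterns["person_verbs"])
assert any(rx.search("Maria perguntou algo") for rx in patterns["person_verbs"])

# ...and the widened "direct" matcher accepts greetings in both languages.
assert patterns["direct"].search("hi Maria")
assert patterns["direct"].search("olá Maria, tudo bem?")
```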
@@ -514,9 +569,10 @@ def score_entity(name: str, text: str, lines: list) -> dict:
     name_lower = name.lower()
     name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
     pronoun_hits = 0
+    all_pronoun_patterns = PRONOUN_PATTERNS + PRONOUN_PATTERNS_PTBR
     for idx in name_line_indices:
         window_text = " ".join(lines[max(0, idx - 2) : idx + 3]).lower()
-        for pronoun_pattern in PRONOUN_PATTERNS:
+        for pronoun_pattern in all_pronoun_patterns:
             if re.search(pronoun_pattern, window_text):
                 pronoun_hits += 1
                 break
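The proximity check walks a five-line window (two lines either side of each line mentioning the name) and counts at most one pronoun hit per mention. A self-contained sketch of that window logic, with an inlined excerpt of the merged pronoun list (the English entries here are assumed; the pt-br ones come from the new constant):

```python
import re

# Excerpt of PRONOUN_PATTERNS + PRONOUN_PATTERNS_PTBR.
pronouns = [r"\bshe\b", r"\bhe\b", r"\bela\b", r"\bele\b", r"\bdela\b"]

lines = [
    "Maria estava no escritório hoje.",
    "Ela pediu café e começou a revisar o código.",
]

hits = 0
for idx, line in enumerate(lines):
    if "maria" not in line.lower():
        continue  # only windows around mentions of the name are scanned
    window = " ".join(lines[max(0, idx - 2) : idx + 3]).lower()
    if any(re.search(p, window) for p in pronouns):
        hits += 1  # at most one hit per mention, as in score_entity

assert hits == 1
```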

tests/test_entity_detector.py

Lines changed: 126 additions & 0 deletions
@@ -1,11 +1,15 @@
 """Tests for mempalace.entity_detector."""
 
 import os
+import shutil
+import tempfile
+from pathlib import Path
 from unittest.mock import patch
 
 from mempalace.entity_detector import (
     PROSE_EXTENSIONS,
     STOPWORDS,
+    _build_patterns,
     _print_entity_list,
     classify_entity,
     confirm_entities,
@@ -378,3 +382,125 @@ def test_scan_for_detection_max_files(tmp_path):
         (tmp_path / f"note{i}.md").write_text(f"content {i}")
     files = scan_for_detection(str(tmp_path), max_files=5)
     assert len(files) <= 5
+
+
+# ── Brazilian Portuguese (pt-br) tests — PR #156 ─────────────────────────────
+
+
+def test_scores_english_person_verbs():
+    text = (
+        "Maria said hello to the team.\n"
+        "Maria asked about the release.\n"
+        "Maria decided to approve it.\n"
+        "Maria wrote the spec. Maria wrote the email.\n"
+    )
+    scores = score_entity("Maria", text, text.splitlines())
+    assert scores["person_score"] > 0
+    assert any("action" in s for s in scores["person_signals"])
+
+
+def test_scores_portuguese_person_verbs():
+    text = (
+        "Maria disse oi para o time.\n"
+        "Maria perguntou sobre o lançamento.\n"
+        "Maria decidiu aprovar.\n"
+        "Depois Maria escreveu a spec. Maria escreveu o email.\n"
+    )
+    scores = score_entity("Maria", text, text.splitlines())
+    assert scores["person_score"] > 0
+    assert any("action" in s for s in scores["person_signals"])
+
+    patterns = _build_patterns("Maria")
+    assert any(rx.search("Maria disse oi") for rx in patterns["person_verbs"])
+    assert any(rx.search("Maria perguntou algo") for rx in patterns["person_verbs"])
+    assert any(rx.search("Maria decidiu aprovar") for rx in patterns["person_verbs"])
+
+
+def test_portuguese_pronoun_proximity():
+    text = (
+        "Maria estava no escritório hoje.\n"
+        "Ela pediu café e começou a revisar o código.\n"
+        "Dela veio a sugestão de migrar para Postgres.\n"
+        "Maria estava no escritório hoje.\n"
+        "Ela pediu café e começou a revisar o código.\n"
+        "Dela veio a sugestão de migrar para Postgres.\n"
+    )
+    scores = score_entity("Maria", text, text.splitlines())
+    assert any("pronoun" in s for s in scores["person_signals"])
+
+
+def test_portuguese_direct_address():
+    text = "oi Maria, tudo bem? obrigado Maria. olá Maria, chegou o relatório."
+    patterns = _build_patterns("Maria")
+    assert len(patterns["direct"].findall(text)) == 3
+
+    scores = score_entity("Maria", text, text.splitlines())
+    assert scores["person_score"] >= 12
+
+
+def test_mixed_english_portuguese_corpus():
+    english_only = (
+        "Maria said hello.\n"
+        "Maria asked about the release.\n"
+        "Maria decided to ship.\n"
+        "Maria wrote the note.\n"
+        "Maria wrote the note.\n"
+    )
+    mixed = (
+        "Maria said hello. Maria disse oi.\n"
+        "Maria asked about the release. Maria perguntou sobre o lançamento.\n"
+        "Maria decided to ship. Maria decidiu entregar.\n"
+        "Maria wrote the note. Maria escreveu a nota.\n"
+        "Maria wrote the note. Maria escreveu a nota.\n"
+    )
+    english_scores = score_entity("Maria", english_only, english_only.splitlines())
+    mixed_scores = score_entity("Maria", mixed, mixed.splitlines())
+    assert mixed_scores["person_score"] > english_scores["person_score"]
+
+
+def test_portuguese_dialogue_marker_in_quoted_text():
+    text = '"Maria disse que o deploy rodou bem."\n"Maria disse que está tudo OK."\n'
+    scores = score_entity("Maria", text, text.splitlines())
+    assert any("dialogue" in s for s in scores["person_signals"])
+
+
+def test_detect_entities_finds_portuguese_person():
+    text = (
+        "Maria disse que o deploy foi bem.\n"
+        "Depois Maria perguntou sobre o backend.\n"
+        "Maria decidiu aprovar a migração.\n"
+        "Maria escreveu a documentação final.\n"
+        "Ela é a nova líder do time.\n"
+    )
+    tmpdir = tempfile.mkdtemp()
+    try:
+        file_path = Path(tmpdir) / "reuniao.md"
+        file_path.write_text(text, encoding="utf-8")
+        detected = detect_entities([file_path])
+        all_names = [e["name"] for e in detected["people"] + detected["uncertain"]]
+        assert "Maria" in all_names
+    finally:
+        shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+def test_detect_entities_picks_up_accented_names():
+    text = (
+        "João é o líder do time.\n"
+        "João disse que o lançamento foi bem.\n"
+        "João perguntou sobre o backend novo.\n"
+        "Depois João escreveu o relatório.\n"
+        "Inês entrou no projeto semana passada.\n"
+        "Inês disse que prefere Postgres.\n"
+        "Inês escreveu a documentação da API.\n"
+        "Inês decidiu aprovar a migração.\n"
+    )
+    tmpdir = tempfile.mkdtemp()
+    try:
+        file_path = Path(tmpdir) / "reuniao.md"
+        file_path.write_text(text, encoding="utf-8")
+        detected = detect_entities([file_path])
+        all_names = [e["name"] for e in detected["people"] + detected["uncertain"]]
+        assert "João" in all_names
+        assert "Inês" in all_names
+    finally:
+        shutil.rmtree(tmpdir, ignore_errors=True)
