fix: do not stopword caro/cara -- they are valid first names

mvalentsev · mvalentsev · commit a55770a999ce · 2026-04-14T04:49:00.000+05:00
caro and cara were in both STOPWORDS and PERSON_VERB_PATTERNS_PTBR as
direct-address markers. Because extract_candidates filters candidates
against STOPWORDS before pattern scoring runs, a person literally
named Cara or Caro (valid English/Italian/Portuguese first names) was
silently dropped from detection.

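Remove caro / cara from STOPWORDS and leave an explanatory comment in
their place. The direct-address patterns still fire when these words
precede another name (caro Maria, cara Ana), so PT-BR behaviour is
unchanged for the filler-word case.

Also hoist the combined EN + PT-BR pronoun regex used by score_entity
to a module-level _ALL_PRONOUN_RE, so it is built once at import time
instead of on every score_entity call.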

oi / ola / olá / obrigado / obrigada stay as stopwords -- they are
practically never first names in the corpora this detector targets.
diff --git a/mempalace/entity_detector.py b/mempalace/entity_detector.py
@@ -112,6 +112,10 @@
     r'"{name}\s+disse',
 ]
 
+# Combined pronoun regex for score_entity (includes EN + PT-BR).
+# Module-level compile to avoid per-call overhead (same approach as PRONOUN_RE).
+_ALL_PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS + PRONOUN_PATTERNS_PTBR), re.IGNORECASE)
+
 # Project signals — things projects have/do
 PROJECT_VERB_PATTERNS = [
     r"\bbuilding\s+{name}\b",
@@ -369,8 +373,11 @@
     "olá",
     "obrigado",
     "obrigada",
-    "caro",
-    "cara",
+    # NOTE: "caro" / "cara" are intentionally NOT stopwords -- they are valid
+    # first names in English/Italian/Portuguese (Cara, Caro) and are already
+    # handled as direct-address markers in PERSON_VERB_PATTERNS_PTBR when they
+    # precede another name. Adding them here would silently drop "Cara" /
+    # "Caro" from candidate extraction.
     # UI/action words that appear in how-to content
     "click",
     "hit",
@@ -573,12 +580,9 @@ def score_entity(name: str, text: str, lines: list) -> dict:
     name_lower = name.lower()
     name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
     pronoun_hits = 0
-    all_pronoun_re = re.compile(
-        "|".join(PRONOUN_PATTERNS + PRONOUN_PATTERNS_PTBR), re.IGNORECASE
-    )
     for idx in name_line_indices:
         window_text = " ".join(lines[max(0, idx - 2) : idx + 3])
-        if all_pronoun_re.search(window_text):
+        if _ALL_PRONOUN_RE.search(window_text):
             pronoun_hits += 1
     if pronoun_hits > 0:
         person_score += pronoun_hits * 2
diff --git a/tests/test_entity_detector.py b/tests/test_entity_detector.py
@@ -504,3 +504,22 @@ def test_detect_entities_picks_up_accented_names():
         assert "Inês" in all_names
     finally:
         shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+def test_extract_candidates_keeps_cara_and_caro_as_names():
+    # Regression test: "cara" / "caro" are PT-BR direct-address markers (see
+    # PERSON_VERB_PATTERNS_PTBR) but also valid English/Italian/Portuguese first
+    # names, so STOPWORDS must not drop them before pattern scoring can run.
+    text = (
+        "Cara said hello to the team.\n"
+        "Cara laughed at the joke.\n"
+        "Cara smiled and waved.\n"
+        "Cara joined the standup.\n"
+        "Caro wrote the release notes.\n"
+        "Caro reviewed the PR.\n"
+        "Caro approved the merge.\n"
+        "Caro deployed the build.\n"
+    )
+    result = extract_candidates(text)
+    assert "Cara" in result
+    assert "Caro" in result