Skip to content

Commit a55770a

Browse files
committed
fix: do not stopword caro/cara -- they are valid first names
caro and cara were in both STOPWORDS and PERSON_VERB_PATTERNS_PTBR as direct-address markers. Because extract_candidates filters candidates against STOPWORDS before pattern scoring runs, a person literally named Cara or Caro (valid English/Italian/Portuguese first names) was silently dropped from detection. Remove caro / cara from STOPWORDS and leave an explanatory comment in their place. The direct-address patterns still fire when these words precede another name (caro Maria, cara Ana), so PT-BR behaviour is unchanged for the filler-word case. oi / ola / olá / obrigado / obrigada stay as stopwords -- they are practically never first names in the corpora this detector targets.
1 parent ac4c0fd commit a55770a

2 files changed

Lines changed: 29 additions & 6 deletions

File tree

mempalace/entity_detector.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,10 @@
112112
r'"{name}\s+disse',
113113
]
114114

115+
# Combined pronoun regex for score_entity (includes EN + PT-BR).
116+
# Module-level compile to avoid per-call overhead (same approach as PRONOUN_RE).
117+
_ALL_PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS + PRONOUN_PATTERNS_PTBR), re.IGNORECASE)
118+
115119
# Project signals — things projects have/do
116120
PROJECT_VERB_PATTERNS = [
117121
r"\bbuilding\s+{name}\b",
@@ -369,8 +373,11 @@
369373
"olá",
370374
"obrigado",
371375
"obrigada",
372-
"caro",
373-
"cara",
376+
# NOTE: "caro" / "cara" are intentionally NOT stopwords -- they are valid
377+
# first names in English/Italian/Portuguese (Cara, Caro) and are already
378+
# handled as direct-address markers in PERSON_VERB_PATTERNS_PTBR when they
379+
# precede another name. Adding them here would silently drop "Cara" /
380+
# "Caro" from candidate extraction.
374381
# UI/action words that appear in how-to content
375382
"click",
376383
"hit",
@@ -573,12 +580,9 @@ def score_entity(name: str, text: str, lines: list) -> dict:
573580
name_lower = name.lower()
574581
name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
575582
pronoun_hits = 0
576-
all_pronoun_re = re.compile(
577-
"|".join(PRONOUN_PATTERNS + PRONOUN_PATTERNS_PTBR), re.IGNORECASE
578-
)
579583
for idx in name_line_indices:
580584
window_text = " ".join(lines[max(0, idx - 2) : idx + 3])
581-
if all_pronoun_re.search(window_text):
585+
if _ALL_PRONOUN_RE.search(window_text):
582586
pronoun_hits += 1
583587
if pronoun_hits > 0:
584588
person_score += pronoun_hits * 2

tests/test_entity_detector.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,3 +504,22 @@ def test_detect_entities_picks_up_accented_names():
504504
assert "Inês" in all_names
505505
finally:
506506
shutil.rmtree(tmpdir, ignore_errors=True)
507+
508+
509+
def test_extract_candidates_keeps_cara_and_caro_as_names():
510+
# "cara" / "caro" appear in PERSON_VERB_PATTERNS_PTBR as direct-address
511+
# markers, but they are also valid first names in English/Italian/Portuguese.
512+
# STOPWORDS must not drop them before pattern scoring can run.
513+
text = (
514+
"Cara said hello to the team.\n"
515+
"Cara laughed at the joke.\n"
516+
"Cara smiled and waved.\n"
517+
"Cara joined the standup.\n"
518+
"Caro wrote the release notes.\n"
519+
"Caro reviewed the PR.\n"
520+
"Caro approved the merge.\n"
521+
"Caro deployed the build.\n"
522+
)
523+
result = extract_candidates(text)
524+
assert "Cara" in result
525+
assert "Caro" in result

0 commit comments

Comments
 (0)