Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 51 additions & 1 deletion mempalace/entity_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,56 @@
"inference",
}

# Lowercase programming vocabulary: language keywords, standard types and
# traits, framework and tool names, plus generic engineering nouns.
# Words listed here are meant to be rejected as entity candidates.
# Every entry is lowercase so that a lowercased word can be looked up directly.
CODE_KEYWORDS = {
    # --- Rust: types, traits, derive macros, popular crates ---
    "string", "vec", "debug", "clone", "copy", "send", "sync",
    "serialize", "deserialize", "display", "default", "hash",
    "option", "result", "box", "arc", "mutex", "cell", "ref",
    "impl", "trait", "struct", "enum", "match", "async", "await",
    "tokio", "serde", "anyhow", "thiserror",
    # --- Shared between Rust and C++ ---
    "iterator", "into", "from",
    # --- JavaScript / TypeScript / React ecosystem ---
    "react", "vue", "angular", "svelte", "next", "nuxt",
    "node", "deno", "bun", "express", "fetch", "promise",
    "component", "props", "state", "hook", "context", "reducer",
    "dispatch", "effect", "callback", "memo", "fragment",
    "typescript", "javascript", "jquery",
    # --- Python ecosystem ---
    "django", "flask", "fastapi", "pytest", "numpy", "pandas",
    "pydantic", "dataclass", "decorator", "generator", "yield",
    # --- Go ---
    "goroutine", "channel", "defer", "panic", "recover",
    # --- Generic programming nouns ---
    "tree", "graph", "queue", "stack", "array", "map", "set",
    "index", "buffer", "stream", "socket", "thread", "process",
    "handler", "middleware", "router", "controller", "service",
    "schema", "query", "mutation", "resolver",
    "config", "logger", "parser", "builder", "factory",
    "event", "listener", "observer", "adapter", "wrapper",
    "payload", "request", "response", "header", "body",
    "token", "session", "cookie", "cache", "proxy",
    # --- Language and runtime names (written capitalized in prose) ---
    "rust", "python", "golang", "kotlin", "swift", "scala",
    "ruby", "java", "perl", "lua", "dart", "elixir",
    # --- Build tools and package managers ---
    "cargo", "npm", "yarn", "pnpm", "pip", "conda", "maven", "gradle",
    # --- Frameworks, libraries and infrastructure often capitalized ---
    "tauri", "electron", "vite", "webpack", "babel", "eslint",
    "docker", "kubernetes", "redis", "postgres", "mongo", "sqlite",
    # --- CSS / UI toolkits ---
    "tailwind", "bootstrap", "material",
    # --- Version control ---
    "git", "github", "gitlab", "bitbucket",
    # --- Capitalized words that recur in code comments and UI code ---
    "todo", "fixme", "hack", "note", "bug", "feature",
    "phase", "flow", "step", "stage", "task", "action",
    "view", "page", "layout", "modal", "dialog", "panel",
    "table", "column", "row", "field", "form", "button",
}

# For entity detection — prose only, no code files
# Code files have too many capitalized names (classes, functions) that aren't entities
PROSE_EXTENSIONS = {
Expand Down Expand Up @@ -450,7 +500,7 @@ def extract_candidates(text: str) -> dict:

counts = defaultdict(int)
for word in raw:
if word.lower() not in STOPWORDS and len(word) > 1:
if word.lower() not in STOPWORDS and word.lower() not in CODE_KEYWORDS and len(word) > 1:
counts[word] += 1

# Also find multi-word proper nouns (e.g. "Memory Palace", "Claude Code")
Expand Down
102 changes: 102 additions & 0 deletions tests/test_code_keywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Tests for entity detector CODE_KEYWORDS filtering (#348)."""

from mempalace.entity_detector import extract_candidates, detect_entities, CODE_KEYWORDS, STOPWORDS


class TestCodeKeywordsFiltering:
    """Checks that programming vocabulary never surfaces as an entity candidate."""

    def test_rust_types_excluded(self):
        """Capitalized Rust type/trait names must be absent from the candidates."""
        rust_names = ["String", "Vec", "Debug", "Clone"]
        # Repeat every word ten times, space separated, in list order.
        text = "".join(f"{name} " * 10 for name in rust_names)
        found = extract_candidates(text)
        for keyword in rust_names:
            assert keyword not in found, f"{keyword} should be filtered by CODE_KEYWORDS"

    def test_rust_derive_macros_excluded(self):
        """The derive names Serialize and Deserialize must be absent from the candidates."""
        found = extract_candidates("Serialize " * 10 + "Deserialize " * 10)
        for derive_name in ("Serialize", "Deserialize"):
            assert derive_name not in found

    def test_framework_names_excluded(self):
        """Framework names (React, Tauri, Node, Vue) must be absent from the candidates."""
        frameworks = ["React", "Tauri", "Node", "Vue"]
        text = "".join(f"{name} " * 10 for name in frameworks)
        found = extract_candidates(text)
        for name in frameworks:
            assert name not in found, f"{name} should be filtered"

    def test_language_names_excluded(self):
        """Language names (Rust, Python, Kotlin) must be absent from the candidates."""
        languages = ["Rust", "Python", "Kotlin"]
        text = "".join(f"{name} " * 10 for name in languages)
        found = extract_candidates(text)
        for name in languages:
            assert name not in found, f"{name} should be filtered"

    def test_common_code_patterns_excluded(self):
        """Generic code nouns (Phase, Flow, Tree, Graph) must be absent from the candidates."""
        patterns = ["Phase", "Flow", "Tree", "Graph"]
        text = "".join(f"{name} " * 10 for name in patterns)
        found = extract_candidates(text)
        for name in patterns:
            assert name not in found, f"{name} should be filtered"

    def test_real_project_names_not_excluded(self):
        """Project names such as CodeMAP and MalCheck must not be on either blocklist.

        Only membership in CODE_KEYWORDS and STOPWORDS is inspected here;
        the detector itself is not invoked.
        """
        for blocklist in (CODE_KEYWORDS, STOPWORDS):
            for project in ("codemap", "malcheck"):
                assert project not in blocklist

    def test_real_person_names_not_excluded(self):
        """Ordinary first names must still come back as candidates."""
        people = ["Alice", "Bob", "Charlie"]
        found = extract_candidates("".join(f"{person} " * 10 for person in people))
        for person in people:
            assert person in found

    def test_code_keywords_are_lowercase(self):
        """Every CODE_KEYWORDS entry has to be lowercase for consistent matching."""
        for keyword in CODE_KEYWORDS:
            assert keyword.lower() == keyword, f"CODE_KEYWORDS entry '{keyword}' should be lowercase"

    def test_no_overlap_with_stopwords(self):
        """Compute the CODE_KEYWORDS / STOPWORDS overlap without enforcing it to be empty.

        Some overlap is tolerated, so this only verifies that the intersection
        can be computed and yields a set; it documents the current state.
        """
        shared = CODE_KEYWORDS & STOPWORDS
        assert isinstance(shared, set)  # smoke check only

    def test_detect_entities_with_code_heavy_content(self, tmp_path):
        """Full pipeline: Rust keywords must not be reported as projects or uncertain."""
        # Fake Rust source, repeated so each keyword occurs several times.
        rust_source = """
use std::collections::HashMap;
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Config {
name: String,
values: Vec<String>,
}
impl Default for Config {
fn default() -> Self { Config { name: String::new(), values: Vec::new() } }
}
""" * 5

        code_path = tmp_path / "main.rs"
        code_path.write_text(rust_source)

        # A prose file that mentions a genuine project name.
        readme_path = tmp_path / "README.md"
        readme_path.write_text("MyProject is a tool for data analysis. " * 10)

        result = detect_entities([readme_path, code_path], max_files=10)

        # Gather names from the "projects" bucket first, then "uncertain".
        all_detected = [
            entry["name"]
            for bucket in ("projects", "uncertain")
            for entry in result[bucket]
        ]

        for keyword in ["String", "Vec", "Debug", "Clone", "Serialize", "Deserialize"]:
            assert keyword not in all_detected, f"{keyword} should not be detected"