-
Notifications
You must be signed in to change notification settings - Fork 2
fix: dead-code benchmark correctness + binary asset filtering #5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
cbcf1db
eb295db
bd4fae5
c3db97b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,41 @@ | ||
| """Git utilities for cloning repos and creating zip archives.""" | ||
|
|
||
| import asyncio | ||
| import fnmatch | ||
| import logging | ||
| import os | ||
| import zipfile | ||
|
|
||
| logger = logging.getLogger("mcpbr.supermodel") | ||
|
|
||
| # Binary / media assets that are irrelevant for code analysis. | ||
| # Applied to every zip regardless of per-task zip_exclude config. | ||
| BINARY_EXCLUDE_PATTERNS = [ | ||
| "*.mp4", | ||
| "*.mov", | ||
| "*.avi", | ||
| "*.webm", | ||
| "*.png", | ||
| "*.jpg", | ||
| "*.jpeg", | ||
| "*.gif", | ||
| "*.svg", | ||
| "*.ico", | ||
| "*.webp", | ||
| "*.woff", | ||
| "*.woff2", | ||
| "*.ttf", | ||
| "*.eot", | ||
| "*.otf", | ||
| "*.pdf", | ||
| "*.zip", | ||
| "*.tar", | ||
| "*.gz", | ||
| "*.mp3", | ||
| "*.wav", | ||
| "*.ogg", | ||
| ] | ||
|
|
||
|
|
||
| async def clone_repo_at_commit(repo: str, commit: str, dest: str) -> None: | ||
| """Clone a repo and checkout a specific commit. | ||
|
|
@@ -117,18 +148,25 @@ async def zip_repo( | |
|
|
||
| is_git = os.path.isdir(os.path.join(repo_dir, ".git")) | ||
|
|
||
| all_excludes = BINARY_EXCLUDE_PATTERNS + (exclude_patterns or []) | ||
|
|
||
| if is_git: | ||
| return await _zip_repo_git_archive(repo_dir, output_zip, scope_prefix) | ||
| return await _zip_repo_git_archive(repo_dir, output_zip, scope_prefix, all_excludes) | ||
| else: | ||
| return await _zip_repo_fallback(repo_dir, output_zip, scope_prefix, exclude_patterns) | ||
| return await _zip_repo_fallback(repo_dir, output_zip, scope_prefix, all_excludes) | ||
|
|
||
|
|
||
| async def _zip_repo_git_archive( | ||
| repo_dir: str, | ||
| output_zip: str, | ||
| scope_prefix: str | None = None, | ||
| exclude_patterns: list[str] | None = None, | ||
| ) -> str: | ||
| """Create zip using ``git archive`` — only includes tracked files.""" | ||
| """Create zip using ``git archive`` — only includes tracked files. | ||
|
|
||
| If exclude_patterns are provided, rewrites the zip to strip matching entries | ||
| (git archive has no native exclude support). | ||
| """ | ||
| cmd = ["git", "archive", "--format=zip", "-o", output_zip, "HEAD"] | ||
| if scope_prefix: | ||
| cmd.append(scope_prefix) | ||
|
|
@@ -142,9 +180,37 @@ async def _zip_repo_git_archive( | |
| _, stderr = await asyncio.wait_for(proc.communicate(), timeout=120) | ||
| if proc.returncode != 0: | ||
| raise RuntimeError(f"git archive failed: {stderr.decode()}") | ||
|
|
||
| if exclude_patterns: | ||
| _filter_zip_entries(output_zip, exclude_patterns) | ||
|
|
||
| return output_zip | ||
|
|
||
|
|
||
| def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None: | ||
| """Rewrite zip in-place, removing entries whose basename matches any glob pattern.""" | ||
| tmp_path = zip_path + ".tmp" | ||
| removed = 0 | ||
| try: | ||
| with ( | ||
| zipfile.ZipFile(zip_path, "r") as zin, | ||
| zipfile.ZipFile(tmp_path, "w", compression=zipfile.ZIP_DEFLATED) as zout, | ||
| ): | ||
| for item in zin.infolist(): | ||
| basename = os.path.basename(item.filename) | ||
| if any(fnmatch.fnmatch(basename, pat) for pat in patterns): | ||
| removed += 1 | ||
| continue | ||
|
Comment on lines
+199
to
+203
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Path-based excludes are silently ignored in git-archive filtering.
💡 Proposed fix def _filter_zip_entries(zip_path: str, patterns: list[str]) -> None:
"""Rewrite zip in-place, removing entries whose basename matches any glob pattern."""
@@
for item in zin.infolist():
- basename = os.path.basename(item.filename)
- if any(fnmatch.fnmatch(basename, pat) for pat in patterns):
+ rel_path = item.filename.lstrip("./")
+ basename = os.path.basename(rel_path)
+ if any(
+ fnmatch.fnmatch(rel_path, pat) or fnmatch.fnmatch(basename, pat)
+ for pat in patterns
+ ):
removed += 1
continue
zout.writestr(item, zin.read(item.filename))🤖 Prompt for AI Agents |
||
| zout.writestr(item, zin.read(item.filename)) | ||
| os.replace(tmp_path, zip_path) | ||
| except Exception: | ||
| if os.path.exists(tmp_path): | ||
| os.unlink(tmp_path) | ||
| raise | ||
| if removed: | ||
| logger.info(f"Filtered {removed} binary entries from {os.path.basename(zip_path)}") | ||
|
|
||
|
|
||
| async def _zip_repo_fallback( | ||
| repo_dir: str, | ||
| output_zip: str, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Prompt/schema mismatch will zero out results on normal runs.
This script reads only
analysis["deadCodeCandidates"], but the generatedsupermodel_dead_code_analysis.jsonindex is chunked (chunkFiles) insrc/mcpbr/benchmarks/supermodel/benchmark.py(Line 514-518). Result: script writes emptydead_codedespite real candidates existing in chunk files.🔧 Suggested prompt-script update (support both schemas)
Also applies to: 147-148, 171-174
🤖 Prompt for AI Agents