Skip to content

Commit dfd7511

Browse files
authored
Add heuristics-based minified file detection (#619)
# Motivation <!-- Why is this change necessary? --> # Content <!-- Please include a summary of the change --> # Testing <!-- How was the change tested? --> # Please check the following before marking your PR as ready for review - [ ] I have added tests for my changes - [ ] I have updated the documentation or added new documentation as needed
1 parent 68b8a1a commit dfd7511

File tree

3 files changed

+20
-1
lines changed

3 files changed

+20
-1
lines changed

src/codegen/sdk/codebase/codebase_context.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@
6767
".*/ace/.*.js",
6868
"src/vs/platform/contextview/browser/contextMenuService.ts",
6969
"*/compiled/*",
70-
"*/*.min.js",
70+
"*.min.js",
7171
]
7272

7373

src/codegen/sdk/core/file.py

+8
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545

4646
logger = logging.getLogger(__name__)
4747

48+
MINIFIED_FILE_THRESHOLD = 500
49+
4850

4951
@apidoc
5052
class File(Editable[None]):
@@ -577,6 +579,12 @@ def invalidate(self):
577579
def from_content(cls, filepath: str | PathLike | Path, content: str, ctx: CodebaseContext, sync: bool = True, verify_syntax: bool = True) -> Self | None:
578580
"""Creates a new file from content and adds it to the graph."""
579581
path = ctx.to_absolute(filepath)
582+
583+
# Heuristic: treat the file as minified (and skip it) if any single line is at least MINIFIED_FILE_THRESHOLD characters long
584+
if any(len(line) >= MINIFIED_FILE_THRESHOLD for line in content.split("\n")):
585+
logger.info(f"File {filepath} is a minified file (Line length < {MINIFIED_FILE_THRESHOLD}). Skipping...", extra={"filepath": filepath})
586+
return None
587+
580588
ts_node = parse_file(path, content)
581589
if ts_node.has_error and verify_syntax:
582590
logger.info("Failed to parse file %s", filepath)

tests/unit/codegen/sdk/codebase/file/test_file.py

+11
Original file line numberDiff line numberDiff line change
@@ -211,3 +211,14 @@ def test_files_in_subdirectories_case_sensitivity(tmpdir) -> None:
211211
assert codebase.has_file("SubDir3/File3.py", ignore_case=False)
212212
assert not codebase.has_file("SUBDIR3/FILE3.py", ignore_case=False)
213213
assert not codebase.has_file("subdir3/file3.py", ignore_case=False)
214+
215+
216+
def test_minified_file(tmpdir) -> None:
217+
with get_codebase_session(tmpdir=tmpdir, files={"file1.min.js": "console.log(123)", "file2.js": f"console.log(1{'0' * 1000})"}) as codebase:
218+
# This should match the `*.min.js` pattern
219+
file1 = codebase.ctx.get_file("file1.min.js")
220+
assert file1 is None
221+
222+
# This file contains a line longer than the maximum line length threshold, so it should be skipped as minified
223+
file2 = codebase.ctx.get_file("file2.js")
224+
assert file2 is None

0 commit comments

Comments
 (0)