diff --git a/.gitignore b/.gitignore index c66d067..2bae3d1 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,8 @@ build/ coverage.xml test-results.xml .dev.vars +.env +.env.* .claude/ .claude-flow/ .serena/ diff --git a/.secrets.baseline b/.secrets.baseline index 473c8f3..3506c0d 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -204,7 +204,7 @@ "filename": "tests/test_grippy_codebase.py", "hashed_secret": "3acfb2c2b433c0ea7ff107e33df91b18e52f960f", "is_verified": false, - "line_number": 561 + "line_number": 562 } ], "tests/test_grippy_embedder.py": [ @@ -254,21 +254,21 @@ "filename": "tests/test_grippy_review.py", "hashed_secret": "3e536cc49ad17f2f50dc6f0aab08495bddbb2833", "is_verified": false, - "line_number": 906 + "line_number": 907 }, { "type": "Secret Keyword", "filename": "tests/test_grippy_review.py", "hashed_secret": "80c3eb3a746f82974a9696275d8b52a37fba449b", "is_verified": false, - "line_number": 938 + "line_number": 939 }, { "type": "AWS Access Key", "filename": "tests/test_grippy_review.py", "hashed_secret": "1d5bc0e7232d07b336f4d35db4c0200142962f4a", "is_verified": false, - "line_number": 1463 + "line_number": 1464 } ], "tests/test_grippy_rule_secrets.py": [ @@ -295,5 +295,5 @@ } ] }, - "generated_at": "2026-03-15T06:39:33Z" + "generated_at": "2026-04-03T17:34:56Z" } diff --git a/CLAUDE.md b/CLAUDE.md index a1e0d3d..42c5f38 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -158,7 +158,7 @@ Graph enrichment: `enrich_results()` (rules/enrichment.py) post-processes findin ### Codebase Tools (`codebase.py`) — LLM-facing, security-critical - Path traversal: `Path.is_relative_to()` (not `startswith`) -- Symlink: `grep_code` uses `-S` flag (no follow) +- Symlink: `grep_code` uses `-r` (not `-R`), which does not follow symlinks on GNU grep. BSD grep `-r` does follow symlinks; `-S` is BSD-only and not used. See DEBT-INT-009. - Glob timeout: 5-second `time.monotonic()` deadline - Result caps: 5,000 files indexed, 500 glob results, 12,000 chars per tool response - Sanitization: `tool_hooks` middleware applies `navi_sanitize.clean()` + XML-escape + 12K truncation to all tool outputs before LLM sees them diff --git a/src/grippy/codebase.py b/src/grippy/codebase.py index c02abc9..6bf4b31 100644 --- a/src/grippy/codebase.py +++ b/src/grippy/codebase.py @@ -731,6 +731,27 @@ def grep_code(pattern: str, glob: str = "*.py", context_lines: int = 2) -> str: return grep_code +def _reject_symlinks(path: Path, repo_root: Path) -> str | None: + """Check if any component of *path* is a symlink. + + Defense-in-depth: rejects symlinks *before* resolve() so a symlink + inside the repo that resolves to an external target cannot bypass the + is_relative_to() guard. + + Returns an error message string if a symlink is found, or None if safe. + """ + try: + relative = path.relative_to(repo_root) + except ValueError: + return "Error: path outside repo root." + current = repo_root + for part in relative.parts: + current = current / part + if current.is_symlink(): + return "Error: symlinks not allowed." + return None + + def _make_read_file(repo_root: Path) -> Any: """Create a read_file tool function bound to a repo root.""" @@ -742,7 +763,11 @@ def read_file(path: str, start_line: int = 0, end_line: int = 0) -> str: :param end_line: last line to read (1-based, 0 = to end) """ target = repo_root / path - # Prevent path traversal + # Layer 1: reject symlinks before resolve() + symlink_err = _reject_symlinks(target, repo_root) + if symlink_err is not None: + return symlink_err + # Layer 2: resolve + is_relative_to (catches traversal via ..) try: target = target.resolve() if not target.is_relative_to(repo_root.resolve()): @@ -795,7 +820,11 @@ def list_files(path: str = ".", glob_pattern: str = "*") -> str: :param glob_pattern: glob pattern to filter files (default "*") """ target = repo_root / path - # Prevent path traversal + # Layer 1: reject symlinks before resolve() + symlink_err = _reject_symlinks(target, repo_root) + if symlink_err is not None: + return symlink_err + # Layer 2: resolve + is_relative_to (catches traversal via ..) try: target = target.resolve() if not target.is_relative_to(repo_root.resolve()): diff --git a/src/grippy/github_review.py b/src/grippy/github_review.py index a366ae5..35bc995 100644 --- a/src/grippy/github_review.py +++ b/src/grippy/github_review.py @@ -160,6 +160,16 @@ def _sanitize_comment_text(text: str) -> str: # Strip markdown images (tracking pixels) and external links (phishing) text = re.sub(r"!\[[^\]]*\]\([^)]+\)", "", text) text = re.sub(r"\[([^\]]*)\]\(https?://[^)]+\)", r"\1", text) + # Strip reference-style link definitions: [id]: url (optional title) + text = re.sub(r"^\s{0,3}\[[^\]]+\]:\s+\S[^\n]*$", "", text, flags=re.MULTILINE) + # Strip reference-style link references: [text][id] and collapsed [text][] + text = re.sub(r"\[([^\]]*)\]\[[^\]]*\]", r"\1", text) + # Defang bare URL autolinks: → hxxps://... (not clickable on GitHub) + text = re.sub( + r"]+>", + lambda m: m.group(0)[1:-1].replace("https://", "hxxps://").replace("http://", "hxxp://"), + text, + ) # Loop unquote until stable — prevents multi-layer URL encoding bypass # (e.g., %2561 -> %61 -> a) that could smuggle dangerous schemes past # the regex check. @@ -174,7 +184,18 @@ def _sanitize_comment_text(text: str) -> str: # URL-encoded to bypass the earlier stripping pass. text = re.sub(r"!\[[^\]]*\]\([^)]+\)", "", text) text = re.sub(r"\[([^\]]*)\]\(https?://[^)]+\)", r"\1", text) + text = re.sub(r"^\s{0,3}\[[^\]]+\]:\s+\S[^\n]*$", "", text, flags=re.MULTILINE) + text = re.sub(r"\[([^\]]*)\]\[[^\]]*\]", r"\1", text) + # Defang autolinks after decode too + text = re.sub( + r"]+>", + lambda m: m.group(0)[1:-1].replace("https://", "hxxps://").replace("http://", "hxxp://"), + text, + ) text = _DANGEROUS_SCHEME_RE.sub("", text) + # Defang bare URLs that survived all prior stripping — GitHub auto-linkifies these + text = re.sub(r"https://", "hxxps://", text) + text = re.sub(r"http://", "hxxp://", text) return text diff --git a/src/grippy/review.py b/src/grippy/review.py index 1d490ec..8d5e287 100644 --- a/src/grippy/review.py +++ b/src/grippy/review.py @@ -22,7 +22,9 @@ import itertools import json +import logging import os +import subprocess import sys from collections import Counter from collections.abc import Callable @@ -42,10 +44,35 @@ from grippy.rules import RuleResult, RuleSeverity, check_gate, load_profile, run_rules from grippy.rules.enrichment import enrich_results, persist_rule_findings +log = logging.getLogger(__name__) + # Max diff size sent to the LLM — configurable for local models with smaller context MAX_DIFF_CHARS = int(os.environ.get("GRIPPY_MAX_DIFF_CHARS", "500000")) +def _is_git_tracked(path: str) -> bool: + """Check if a file is tracked by git. Returns True if tracked. + + Fail-open: returns False if git is unavailable or times out. + This means a tracked .dev.vars loads in that case — acceptable + trade-off for local dev usability. CI environments are excluded + by a separate check before this function is called. + """ + try: + file_path = Path(path) + repo_dir = file_path.parent + # Use relative path — git ls-files expects paths relative to cwd/repo root + result = subprocess.run( + ["git", "ls-files", "--error-unmatch", file_path.name], + capture_output=True, + timeout=5, + cwd=str(repo_dir), + ) + return result.returncode == 0 + except (subprocess.TimeoutExpired, FileNotFoundError): + return False + + _ERROR_HINTS: dict[str, str] = { "CONFIG ERROR": "Valid `GRIPPY_TRANSPORT` values: `openai`, `anthropic`, `google`, `groq`, `mistral`, `local`.", "TIMEOUT": "Increase `GRIPPY_TIMEOUT` or reduce PR diff size.", @@ -286,11 +313,17 @@ def main(*, profile: str | None = None) -> None: if not os.environ.get("CI"): dev_vars_path = Path(__file__).resolve().parent.parent.parent / ".dev.vars" if dev_vars_path.is_file(): - for line in dev_vars_path.read_text().splitlines(): - line = line.strip() - if line and not line.startswith("#") and "=" in line: - key, _, value = line.partition("=") - os.environ.setdefault(key.strip(), value.strip()) + if _is_git_tracked(str(dev_vars_path)): + log.warning( + ".dev.vars is tracked by git — refusing to load. " + "Remove it from tracking: git rm --cached .dev.vars" + ) + else: + for line in dev_vars_path.read_text().splitlines(): + line = line.strip() + if line and not line.startswith("#") and "=" in line: + key, _, value = line.partition("=") + os.environ.setdefault(key.strip(), value.strip()) # Required env token = os.environ.get("GITHUB_TOKEN", "") diff --git a/tests/test_grippy_codebase.py b/tests/test_grippy_codebase.py index 4d6c619..33b738b 100644 --- a/tests/test_grippy_codebase.py +++ b/tests/test_grippy_codebase.py @@ -23,6 +23,7 @@ _make_list_files, _make_read_file, _make_search_code, + _reject_symlinks, _write_manifest, chunk_file, sanitize_tool_hook, @@ -1480,3 +1481,136 @@ def test_payload_non_dict_after_json_parse(self) -> None: row = {"payload": json.dumps([1, 2, 3])} # list, not dict results = CodebaseIndex._parse_results_static([row]) assert results == [] + + +# --- Symlink rejection tests --- + + +class TestRejectSymlinks: + """_reject_symlinks prevents symlink-based path traversal before resolve().""" + + def test_rejects_symlink_file_to_outside(self, tmp_path: Path) -> None: + """Symlink file pointing outside repo root is rejected.""" + repo = tmp_path / "repo" + repo.mkdir() + outside = tmp_path / "secret.txt" + outside.write_text("stolen") + link = repo / "evil.txt" + link.symlink_to(outside) + + result = _reject_symlinks(repo / "evil.txt", repo) + assert result is not None + assert "symlink" in result.lower() + + def test_rejects_symlink_directory_to_outside(self, tmp_path: Path) -> None: + """Symlink directory pointing outside repo root is rejected.""" + repo = tmp_path / "repo" + repo.mkdir() + outside_dir = tmp_path / "outside_dir" + outside_dir.mkdir() + (outside_dir / "secret.py").write_text("stolen") + link = repo / "evil_dir" + link.symlink_to(outside_dir) + + result = _reject_symlinks(repo / "evil_dir", repo) + assert result is not None + assert "symlink" in result.lower() + + def test_allows_regular_file(self, tmp_path: Path) -> None: + """Regular file inside repo returns None (no error).""" + repo = tmp_path / "repo" + repo.mkdir() + (repo / "safe.py").write_text("safe") + + result = _reject_symlinks(repo / "safe.py", repo) + assert result is None + + def test_allows_nested_path_without_symlinks(self, tmp_path: Path) -> None: + """Nested path with no symlinks in any component returns None.""" + repo = tmp_path / "repo" + (repo / "src" / "pkg").mkdir(parents=True) + (repo / "src" / "pkg" / "mod.py").write_text("code") + + result = _reject_symlinks(repo / "src" / "pkg" / "mod.py", repo) + assert result is None + + def test_rejects_symlink_in_intermediate_component(self, tmp_path: Path) -> None: + """Symlink in a middle path component is detected.""" + repo = tmp_path / "repo" + repo.mkdir() + outside_dir = tmp_path / "outside" + outside_dir.mkdir() + (outside_dir / "secret.py").write_text("stolen") + link = repo / "linked_dir" + link.symlink_to(outside_dir) + + result = _reject_symlinks(repo / "linked_dir" / "secret.py", repo) + assert result is not None + assert "symlink" in result.lower() + + def test_rejects_path_outside_repo_root(self, tmp_path: Path) -> None: + """Path that is not relative to repo root is rejected.""" + repo = tmp_path / "repo" + repo.mkdir() + outside = tmp_path / "other" / "file.py" + + result = _reject_symlinks(outside, repo) + assert result is not None + assert "outside" in result.lower() + + +class TestSymlinkDefenseInReadFile: + """read_file rejects symlinks via _reject_symlinks before resolve().""" + + def test_read_file_rejects_symlink_to_outside(self, tmp_path: Path) -> None: + """read_file returns error for symlink pointing outside repo.""" + repo = tmp_path / "repo" + repo.mkdir() + (repo / "safe.py").write_text("safe") + outside = tmp_path / "secret.py" + outside.write_text("stolen data") + link = repo / "evil_link.py" + link.symlink_to(outside) + + read_fn = _make_read_file(repo) + result = read_fn("evil_link.py") + assert "symlink" in result.lower() or "not allowed" in result.lower() + + def test_read_file_allows_regular_file(self, tmp_path: Path) -> None: + """read_file works for normal files after symlink check.""" + repo = tmp_path / "repo" + repo.mkdir() + (repo / "safe.py").write_text("safe content here") + + read_fn = _make_read_file(repo) + result = read_fn("safe.py") + assert "safe content here" in result + + +class TestSymlinkDefenseInListFiles: + """list_files rejects symlinks via _reject_symlinks before resolve().""" + + def test_list_files_rejects_symlink_directory(self, tmp_path: Path) -> None: + """list_files returns error for symlinked directory pointing outside.""" + repo = tmp_path / "repo" + repo.mkdir() + (repo / "safe.py").write_text("safe") + outside_dir = tmp_path / "outside" + outside_dir.mkdir() + (outside_dir / "secret.py").write_text("stolen") + link = repo / "evil_dir" + link.symlink_to(outside_dir) + + list_fn = _make_list_files(repo) + result = list_fn("evil_dir") + assert "symlink" in result.lower() or "not allowed" in result.lower() + + def test_list_files_allows_regular_directory(self, tmp_path: Path) -> None: + """list_files works for normal directories after symlink check.""" + repo = tmp_path / "repo" + (repo / "src").mkdir(parents=True) + (repo / "src" / "main.py").write_text("code") + + list_fn = _make_list_files(repo) + result = list_fn("src") + assert "main.py" in result diff --git a/tests/test_grippy_github_review.py b/tests/test_grippy_github_review.py index cf458df..fd0f14c 100644 --- a/tests/test_grippy_github_review.py +++ b/tests/test_grippy_github_review.py @@ -2078,3 +2078,128 @@ def test_empty_thread_ids_skips_call(self, mock_run) -> None: result = fetch_thread_states([]) assert result == {} mock_run.assert_not_called() + + +# --- Reference-style markdown link stripping --- + + +class TestReferenceStyleLinkStripping: + """_sanitize_comment_text strips reference-style links, collapsed refs, and autolinks.""" + + def test_reference_link_with_definition_stripped(self) -> None: + """[text][id] + definition line -> plain text only.""" + from grippy.github_review import _sanitize_comment_text + + text = "[click here][1]\n\n[1]: https://evil.com" + result = _sanitize_comment_text(text) + assert "click here" in result + assert "https://evil.com" not in result + assert "[1]" not in result + + def test_collapsed_reference_stripped(self) -> None: + """[text][] + definition -> plain text only.""" + from grippy.github_review import _sanitize_comment_text + + text = "[click here][]\n\n[click here]: https://evil.com" + result = _sanitize_comment_text(text) + assert "click here" in result + assert "https://evil.com" not in result + + def test_bare_autolink_stripped(self) -> None: + """ -> plain URL text without angle brackets.""" + from grippy.github_review import _sanitize_comment_text + + text = "Visit for details" + result = _sanitize_comment_text(text) + assert " None: + """Regression: existing inline link stripping still works.""" + from grippy.github_review import _sanitize_comment_text + + text = "[click](https://evil.com)" + result = _sanitize_comment_text(text) + assert "https://evil.com" not in result + assert "click" in result + + def test_reference_without_definition_inert(self) -> None: + """[text][id] without a definition is harmless (renders as literal text).""" + from grippy.github_review import _sanitize_comment_text + + text = "[text][undefined-id]" + result = _sanitize_comment_text(text) + # Should be stripped to plain text (defense-in-depth) + assert "text" in result + + def test_multiple_reference_definitions_stripped(self) -> None: + """Multiple definitions are all removed.""" + from grippy.github_review import _sanitize_comment_text + + text = "[a][1] and [b][2]\n\n[1]: https://evil1.com\n[2]: https://evil2.com 'title'" + result = _sanitize_comment_text(text) + assert "https://evil1.com" not in result + assert "https://evil2.com" not in result + assert "a" in result + assert "b" in result + + def test_indented_definition_stripped(self) -> None: + """Definitions with up to 3 spaces of indentation are stripped.""" + from grippy.github_review import _sanitize_comment_text + + text = "[ref][1]\n [1]: https://evil.com" + result = _sanitize_comment_text(text) + assert "https://evil.com" not in result + + def test_http_autolink_stripped(self) -> None: + """ autolinks are stripped too.""" + from grippy.github_review import _sanitize_comment_text + + text = "See " + result = _sanitize_comment_text(text) + assert " None: + """Shortcut reference [text] is neutralized when its definition is stripped. + + Shortcut references have no direct defense — they rely on definition + stripping. This test proves the definition is stripped, leaving [text] + as inert literal text. (Audit finding: defense-in-depth gap.) + """ + from grippy.github_review import _sanitize_comment_text + + text = "Click [evil] for details.\n\n[evil]: https://evil.com" + result = _sanitize_comment_text(text) + # Definition must be stripped — URL must not survive + assert "evil.com" not in result or "hxxps" in result + # The [evil] text itself is fine — it's inert without a definition + assert "evil" in result + + def test_url_encoded_definition_caught_by_post_unquote_pass(self) -> None: + """URL-encoded definition survives pre-unquote pass but is caught after decode. + + Attack: %5B1%5D%3A%20https%3A%2F%2Fevil.com decodes to [1]: https://evil.com + The post-unquote definition stripping pass must catch this. + """ + from grippy.github_review import _sanitize_comment_text + + text = "See [info][1]\n\n%5B1%5D%3A%20https%3A%2F%2Fevil.com" + result = _sanitize_comment_text(text) + assert "evil.com" not in result or "hxxps" in result + + def test_bare_url_defanged(self) -> None: + """Bare https:// URLs (no markdown syntax) are defanged to hxxps://.""" + from grippy.github_review import _sanitize_comment_text + + text = "Visit https://evil.com/phishing for details" + result = _sanitize_comment_text(text) + assert result == "Visit hxxps://evil.com/phishing for details" + + def test_defang_idempotent(self) -> None: + """Already-defanged hxxps:// is not double-mangled.""" + from grippy.github_review import _sanitize_comment_text + + text = "See hxxps://example.com for reference" + result = _sanitize_comment_text(text) + assert result == "See hxxps://example.com for reference" diff --git a/tests/test_grippy_review.py b/tests/test_grippy_review.py index a32b61c..1896d81 100644 --- a/tests/test_grippy_review.py +++ b/tests/test_grippy_review.py @@ -16,6 +16,7 @@ _escape_rule_field, _failure_comment, _format_rule_findings, + _is_git_tracked, _with_timeout, fetch_pr_diff, load_pr_event, @@ -2101,3 +2102,72 @@ def test_workflow_dispatch_bypasses_guard( except (SystemExit, Exception): pass mock_check.assert_not_called() + + +# --- .dev.vars git-tracked guard --- + + +class TestIsGitTracked: + """_is_git_tracked checks git ls-files for tracked status.""" + + @patch("subprocess.run") + def test_tracked_file_returns_true(self, mock_run: MagicMock) -> None: + mock_run.return_value = MagicMock(returncode=0) + assert _is_git_tracked(".dev.vars") is True + + @patch("subprocess.run") + def test_untracked_file_returns_false(self, mock_run: MagicMock) -> None: + mock_run.return_value = MagicMock(returncode=1) + assert _is_git_tracked(".dev.vars") is False + + @patch("subprocess.run") + def test_timeout_returns_false(self, mock_run: MagicMock) -> None: + import subprocess as sp + + mock_run.side_effect = sp.TimeoutExpired(cmd="git", timeout=5) + assert _is_git_tracked(".dev.vars") is False + + @patch("subprocess.run") + def test_git_not_found_returns_false(self, mock_run: MagicMock) -> None: + mock_run.side_effect = FileNotFoundError() + assert _is_git_tracked(".dev.vars") is False + + +class TestDevVarsLoadGuard: + """main() refuses to load .dev.vars when git-tracked.""" + + def test_is_git_tracked_on_tracked_file(self) -> None: + """_is_git_tracked returns True for files in the git index.""" + repo_root = Path(__file__).resolve().parent.parent + tracked_file = repo_root / "pyproject.toml" + assert tracked_file.exists() + assert _is_git_tracked(str(tracked_file)) is True + + def test_is_git_tracked_on_untracked_file(self, tmp_path: Path) -> None: + """_is_git_tracked returns False for files not in the git index.""" + untracked = tmp_path / "not-in-git.txt" + untracked.write_text("test") + assert _is_git_tracked(str(untracked)) is False + + def test_is_git_tracked_on_nonexistent_file(self) -> None: + """_is_git_tracked returns False for files that don't exist.""" + assert _is_git_tracked("/nonexistent/path/.dev.vars") is False + + @patch("grippy.review.subprocess.run", side_effect=FileNotFoundError) + def test_is_git_tracked_git_unavailable(self, _mock: MagicMock) -> None: + """_is_git_tracked returns False (fail-open) when git is not available.""" + assert _is_git_tracked("/some/path") is False + + +class TestGitignoreContainsDevVars: + """Verify .gitignore contains security-sensitive patterns.""" + + def test_gitignore_has_dev_vars(self) -> None: + gitignore = Path(__file__).resolve().parent.parent / ".gitignore" + content = gitignore.read_text() + assert ".dev.vars" in content + + def test_gitignore_has_env_patterns(self) -> None: + gitignore = Path(__file__).resolve().parent.parent / ".gitignore" + content = gitignore.read_text() + assert ".env" in content