Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ build/
coverage.xml
test-results.xml
.dev.vars
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔵 LOW: Added .env patterns to .gitignore

Confidence: 80%

.env and .env.* added to .gitignore.

.env and .env.* are now ignored, preventing accidental commit of sensitive runtime or dev configuration. This is good baseline hygiene.

Suggestion: Verify that no existing .env files are tracked by git. No other action needed.

— Observable code is debuggable code. This is a minor, but important, governance win.

.env
.env.*
.claude/
.claude-flow/
.serena/
Expand Down
10 changes: 5 additions & 5 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@
"filename": "tests/test_grippy_codebase.py",
"hashed_secret": "3acfb2c2b433c0ea7ff107e33df91b18e52f960f",
"is_verified": false,
"line_number": 561
"line_number": 562
}
],
"tests/test_grippy_embedder.py": [
Expand Down Expand Up @@ -254,21 +254,21 @@
"filename": "tests/test_grippy_review.py",
"hashed_secret": "3e536cc49ad17f2f50dc6f0aab08495bddbb2833",
"is_verified": false,
"line_number": 906
"line_number": 907
},
{
"type": "Secret Keyword",
"filename": "tests/test_grippy_review.py",
"hashed_secret": "80c3eb3a746f82974a9696275d8b52a37fba449b",
"is_verified": false,
"line_number": 938
"line_number": 939
},
{
"type": "AWS Access Key",
"filename": "tests/test_grippy_review.py",
"hashed_secret": "1d5bc0e7232d07b336f4d35db4c0200142962f4a",
"is_verified": false,
"line_number": 1463
"line_number": 1464
}
],
"tests/test_grippy_rule_secrets.py": [
Expand All @@ -295,5 +295,5 @@
}
]
},
"generated_at": "2026-03-15T06:39:33Z"
"generated_at": "2026-04-03T17:34:56Z"
}
2 changes: 1 addition & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ Graph enrichment: `enrich_results()` (rules/enrichment.py) post-processes findin
### Codebase Tools (`codebase.py`) — LLM-facing, security-critical

- Path traversal: `Path.is_relative_to()` (not `startswith`)
- Symlink: `grep_code` uses `-S` flag (no follow)
- Symlink: `grep_code` uses `-r` (not `-R`); on GNU grep, `-r` follows symlinks only when they are named on the command line, not those encountered during recursion. BSD grep `-r` does follow symlinks; `-S` is BSD-only and not used. See DEBT-INT-009.
- Glob timeout: 5-second `time.monotonic()` deadline
- Result caps: 5,000 files indexed, 500 glob results, 12,000 chars per tool response
- Sanitization: `tool_hooks` middleware applies `navi_sanitize.clean()` + XML-escape + 12K truncation to all tool outputs before LLM sees them
Expand Down
33 changes: 31 additions & 2 deletions src/grippy/codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,27 @@ def grep_code(pattern: str, glob: str = "*.py", context_lines: int = 2) -> str:
return grep_code
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟠 HIGH: Defense-in-depth symlink rejection on file and directory access

Confidence: 95%

New _reject_symlinks() is called before Path.resolve() in both read_file and list_files. Test coverage was added for various traversal scenarios.

A new _reject_symlinks() function checks each path component for symlinks before resolving the path in both read_file and list_files functions. This prevents symlink-based directory traversal attacks that could expose files outside the repository root, closing a critical path traversal risk.

Suggestion: No further action required on the implemented symlink rejection logic. Maintain test coverage and ensure this pattern is used wherever user-supplied paths are handled.

— This is how breaches start. I'm grudgingly satisfied with this explicit check.



def _reject_symlinks(path: Path, repo_root: Path) -> str | None:
"""Check if any component of *path* is a symlink.

Defense-in-depth: rejects symlinks *before* resolve() so a symlink
inside the repo that resolves to an external target cannot bypass the
is_relative_to() guard.

Returns an error message string if a symlink is found, or None if safe.
"""
try:
relative = path.relative_to(repo_root)
except ValueError:
return "Error: path outside repo root."
current = repo_root
for part in relative.parts:
current = current / part
if current.is_symlink():
return "Error: symlinks not allowed."
Comment on lines +750 to +751
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Permit safe symlinks that resolve inside the repository

_reject_symlinks now rejects any symlinked path component unconditionally, so read_file and list_files fail on repos that use legitimate in-repo symlinks (for example, versioned docs or generated source links). This is a functional regression from the prior resolve()+is_relative_to() behavior, which allowed symlinks whose resolved targets remained under repo_root; the current check blocks safe paths and can hide valid files from review.

Useful? React with 👍 / 👎.

return None


def _make_read_file(repo_root: Path) -> Any:
"""Create a read_file tool function bound to a repo root."""

Expand All @@ -742,7 +763,11 @@ def read_file(path: str, start_line: int = 0, end_line: int = 0) -> str:
:param end_line: last line to read (1-based, 0 = to end)
"""
target = repo_root / path
# Prevent path traversal
# Layer 1: reject symlinks before resolve()
symlink_err = _reject_symlinks(target, repo_root)
if symlink_err is not None:
return symlink_err
# Layer 2: resolve + is_relative_to (catches traversal via ..)
try:
target = target.resolve()
if not target.is_relative_to(repo_root.resolve()):
Expand Down Expand Up @@ -795,7 +820,11 @@ def list_files(path: str = ".", glob_pattern: str = "*") -> str:
:param glob_pattern: glob pattern to filter files (default "*")
"""
target = repo_root / path
# Prevent path traversal
# Layer 1: reject symlinks before resolve()
symlink_err = _reject_symlinks(target, repo_root)
if symlink_err is not None:
return symlink_err
# Layer 2: resolve + is_relative_to (catches traversal via ..)
try:
target = target.resolve()
if not target.is_relative_to(repo_root.resolve()):
Expand Down
21 changes: 21 additions & 0 deletions src/grippy/github_review.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,16 @@ def _sanitize_comment_text(text: str) -> str:
# Strip markdown images (tracking pixels) and external links (phishing)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 MEDIUM: Reference-style markdown link stripping in comment sanitization

Confidence: 90%

Additional regex passes before and after the URL decode loop. Comprehensive tests for all main bypass scenarios.

The _sanitize_comment_text function now removes reference-style markdown links and bare autolinks, closing an attack vector where malicious links might evade detection/stripping via references or encoded forms.

Suggestion: The sanitization logic is now robust for reference-style markdown links. Continue to monitor for any markdown parsing bypasses, especially around edge cases or non-standard syntax.

— Input validation isn't optional. This blocks a real and subtle risk.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 MEDIUM: Expanded Markdown Sanitization to Cover Reference Links and Autolinks

Confidence: 95%

Added regex filters for reference-style links, collapsed references, and additional URL defanging in the sanitization pass.

The _sanitize_comment_text function now strips reference-style link definitions, reference link usages, and defangs bare autolinks before and after URL decoding, mitigating potential abuse of GitHub comment rendering. This is a defense-in-depth improvement against XSS/phishing vectors.

Suggestion: Continue to monitor for Markdown edge cases and test with community-submitted exploit attempts, since Markdown rendering has a long history of bypass tricks. Periodic reviews are advised as GitHub's parser evolves.

— This should have been in from the start. At least it's here now.

text = re.sub(r"!\[[^\]]*\]\([^)]+\)", "", text)
text = re.sub(r"\[([^\]]*)\]\(https?://[^)]+\)", r"\1", text)
# Strip reference-style link definitions: [id]: url (optional title)
text = re.sub(r"^\s{0,3}\[[^\]]+\]:\s+\S[^\n]*$", "", text, flags=re.MULTILINE)
# Strip reference-style link references: [text][id] and collapsed [text][]
text = re.sub(r"\[([^\]]*)\]\[[^\]]*\]", r"\1", text)
# Defang bare URL autolinks: <https://...> → hxxps://... (not clickable on GitHub)
text = re.sub(
r"<https?://[^>]+>",
lambda m: m.group(0)[1:-1].replace("https://", "hxxps://").replace("http://", "hxxp://"),
text,
)
# Loop unquote until stable — prevents multi-layer URL encoding bypass
# (e.g., %2561 -> %61 -> a) that could smuggle dangerous schemes past
# the regex check.
Expand All @@ -174,7 +184,18 @@ def _sanitize_comment_text(text: str) -> str:
# URL-encoded to bypass the earlier stripping pass.
text = re.sub(r"!\[[^\]]*\]\([^)]+\)", "", text)
text = re.sub(r"\[([^\]]*)\]\(https?://[^)]+\)", r"\1", text)
text = re.sub(r"^\s{0,3}\[[^\]]+\]:\s+\S[^\n]*$", "", text, flags=re.MULTILINE)
text = re.sub(r"\[([^\]]*)\]\[[^\]]*\]", r"\1", text)
# Defang autolinks after decode too
text = re.sub(
r"<https?://[^>]+>",
lambda m: m.group(0)[1:-1].replace("https://", "hxxps://").replace("http://", "hxxp://"),
text,
)
text = _DANGEROUS_SCHEME_RE.sub("", text)
# Defang bare URLs that survived all prior stripping — GitHub auto-linkifies these
text = re.sub(r"https://", "hxxps://", text)
text = re.sub(r"http://", "hxxp://", text)
return text


Expand Down
43 changes: 38 additions & 5 deletions src/grippy/review.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@

import itertools
import json
import logging
import os
import subprocess
import sys
from collections import Counter
from collections.abc import Callable
Expand All @@ -42,10 +44,35 @@
from grippy.rules import RuleResult, RuleSeverity, check_gate, load_profile, run_rules
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟠 HIGH: .dev.vars is now blocked from loading when git-tracked

Confidence: 95%

main() will not load .dev.vars if _is_git_tracked returns True. Test suite checks both tracked and untracked loading, and asserts correct env handling.

The main() logic now checks if .dev.vars is tracked by git using _is_git_tracked(), and refuses to load secrets from a tracked .dev.vars file. This prevents attackers from exfiltrating env values via malicious PRs. .env patterns are also added to .gitignore.

Suggestion: No further action needed; guard is correct and has comprehensive test coverage. Continue to verify that new ways of loading secrets are not introduced.

— Secrets and version control are a dangerous combo. This guard is overdue, but effective.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 MEDIUM: Git subprocess now set with explicit cwd for robustness

Confidence: 90%

subprocess.run in _is_git_tracked uses cwd=Path(path).parent. Issue addressed in commit, test coverage for various scenarios.

The git ls-files call in _is_git_tracked now sets cwd to the file's containing directory, so the result no longer depends on whatever working directory the parent process happened to be launched from. This closes a possible bypass where git would be run outside the relevant repo and report the file as untracked.

Suggestion: Maintain the use of explicit cwd on subprocesses interacting with VCS, especially in environments with ambiguous process launch locations.

— Explicit is better than implicit; subprocess cwd was a subtle gap.

from grippy.rules.enrichment import enrich_results, persist_rule_findings

log = logging.getLogger(__name__)

# Max diff size sent to the LLM — configurable for local models with smaller context
MAX_DIFF_CHARS = int(os.environ.get("GRIPPY_MAX_DIFF_CHARS", "500000"))


def _is_git_tracked(path: str) -> bool:
"""Check if a file is tracked by git. Returns True if tracked.

Fail-open: returns False if git is unavailable or times out.
This means a tracked .dev.vars loads in that case — acceptable
trade-off for local dev usability. CI environments are excluded
by a separate check before this function is called.
"""
try:
file_path = Path(path)
repo_dir = file_path.parent
# Use relative path — git ls-files expects paths relative to cwd/repo root
result = subprocess.run(
["git", "ls-files", "--error-unmatch", file_path.name],
capture_output=True,
timeout=5,
cwd=str(repo_dir),
)
return result.returncode == 0
except (subprocess.TimeoutExpired, FileNotFoundError):
Comment on lines +65 to +72
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_is_git_tracked is called with an absolute path from main() (dev_vars_path is resolved). git ls-files --error-unmatch generally expects a pathspec relative to the repo (or current working directory), so passing an absolute path will typically return non-zero even when the file is tracked, defeating the .dev.vars guard. Consider deriving a relative pathspec (e.g., relpath to cwd) and/or running git with cwd set to the repo root and passing just the relative file path.

Suggested change
result = subprocess.run(
["git", "ls-files", "--error-unmatch", path],
capture_output=True,
timeout=5,
cwd=Path(path).parent, # ensure git operates in the file's repo
)
return result.returncode == 0
except (subprocess.TimeoutExpired, FileNotFoundError):
file_path = Path(path).resolve()
parent_dir = file_path.parent
repo_root_result = subprocess.run(
["git", "rev-parse", "--show-toplevel"],
capture_output=True,
text=True,
timeout=5,
cwd=parent_dir,
)
if repo_root_result.returncode != 0:
return False
repo_root = Path(repo_root_result.stdout.strip()).resolve()
relative_path = os.path.relpath(file_path, repo_root)
result = subprocess.run(
["git", "ls-files", "--error-unmatch", relative_path],
capture_output=True,
timeout=5,
cwd=repo_root,
)
return result.returncode == 0
except (subprocess.TimeoutExpired, FileNotFoundError, ValueError):

Copilot uses AI. Check for mistakes.
return False


_ERROR_HINTS: dict[str, str] = {
"CONFIG ERROR": "Valid `GRIPPY_TRANSPORT` values: `openai`, `anthropic`, `google`, `groq`, `mistral`, `local`.",
"TIMEOUT": "Increase `GRIPPY_TIMEOUT` or reduce PR diff size.",
Expand Down Expand Up @@ -286,11 +313,17 @@ def main(*, profile: str | None = None) -> None:
if not os.environ.get("CI"):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 MEDIUM: .dev.vars Environment File Loading Now Blocked if Git-Tracked

Confidence: 90%

Added an `_is_git_tracked()` check before loading `.dev.vars`, and added `.env` patterns to `.gitignore`.

.dev.vars is only loaded if it is not tracked in git, preventing attackers from injecting sensitive configuration files via PRs and exfiltrating via CI. This is enforced with an explicit tracked file refusal and gitignore hardening.

Suggestion: Review other environment or config loading code for similar checks. Confirm that any such files are never referenced directly in CI unless verified safe.

— Catching git-tracked leaks preemptively will save headaches later.

dev_vars_path = Path(__file__).resolve().parent.parent.parent / ".dev.vars"
if dev_vars_path.is_file():
for line in dev_vars_path.read_text().splitlines():
line = line.strip()
if line and not line.startswith("#") and "=" in line:
key, _, value = line.partition("=")
os.environ.setdefault(key.strip(), value.strip())
if _is_git_tracked(str(dev_vars_path)):
log.warning(
".dev.vars is tracked by git — refusing to load. "
"Remove it from tracking: git rm --cached .dev.vars"
)
else:
for line in dev_vars_path.read_text().splitlines():
line = line.strip()
if line and not line.startswith("#") and "=" in line:
key, _, value = line.partition("=")
os.environ.setdefault(key.strip(), value.strip())

# Required env
token = os.environ.get("GITHUB_TOKEN", "")
Expand Down
134 changes: 134 additions & 0 deletions tests/test_grippy_codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
_make_list_files,
_make_read_file,
_make_search_code,
_reject_symlinks,
_write_manifest,
chunk_file,
sanitize_tool_hook,
Expand Down Expand Up @@ -1480,3 +1481,136 @@ def test_payload_non_dict_after_json_parse(self) -> None:
row = {"payload": json.dumps([1, 2, 3])} # list, not dict
results = CodebaseIndex._parse_results_static([row])
assert results == []


# --- Symlink rejection tests ---


class TestRejectSymlinks:
    """Exercise _reject_symlinks, the symlink guard applied before resolve()."""

    def test_rejects_symlink_file_to_outside(self, tmp_path: Path) -> None:
        """A file symlink whose target is outside the repo yields a symlink error."""
        repo_root = tmp_path / "repo"
        repo_root.mkdir()
        external_file = tmp_path / "secret.txt"
        external_file.write_text("stolen")
        (repo_root / "evil.txt").symlink_to(external_file)

        error = _reject_symlinks(repo_root / "evil.txt", repo_root)

        assert error is not None
        assert "symlink" in error.lower()

    def test_rejects_symlink_directory_to_outside(self, tmp_path: Path) -> None:
        """A directory symlink whose target is outside the repo yields a symlink error."""
        repo_root = tmp_path / "repo"
        repo_root.mkdir()
        external_dir = tmp_path / "outside_dir"
        external_dir.mkdir()
        (external_dir / "secret.py").write_text("stolen")
        (repo_root / "evil_dir").symlink_to(external_dir)

        error = _reject_symlinks(repo_root / "evil_dir", repo_root)

        assert error is not None
        assert "symlink" in error.lower()

    def test_allows_regular_file(self, tmp_path: Path) -> None:
        """An ordinary file directly under the repo root passes (None is returned)."""
        repo_root = tmp_path / "repo"
        repo_root.mkdir()
        plain_file = repo_root / "safe.py"
        plain_file.write_text("safe")

        assert _reject_symlinks(repo_root / "safe.py", repo_root) is None

    def test_allows_nested_path_without_symlinks(self, tmp_path: Path) -> None:
        """A nested path made only of real directories and a real file passes."""
        repo_root = tmp_path / "repo"
        package_dir = repo_root / "src" / "pkg"
        package_dir.mkdir(parents=True)
        (package_dir / "mod.py").write_text("code")

        assert _reject_symlinks(repo_root / "src" / "pkg" / "mod.py", repo_root) is None

    def test_rejects_symlink_in_intermediate_component(self, tmp_path: Path) -> None:
        """A symlinked directory in the middle of the path is caught."""
        repo_root = tmp_path / "repo"
        repo_root.mkdir()
        external_dir = tmp_path / "outside"
        external_dir.mkdir()
        (external_dir / "secret.py").write_text("stolen")
        (repo_root / "linked_dir").symlink_to(external_dir)

        error = _reject_symlinks(repo_root / "linked_dir" / "secret.py", repo_root)

        assert error is not None
        assert "symlink" in error.lower()

    def test_rejects_path_outside_repo_root(self, tmp_path: Path) -> None:
        """A path that is not under the repo root yields an 'outside' error."""
        repo_root = tmp_path / "repo"
        repo_root.mkdir()
        # The target is never created: only the lexical relationship matters here.
        unrelated_path = tmp_path / "other" / "file.py"

        error = _reject_symlinks(unrelated_path, repo_root)

        assert error is not None
        assert "outside" in error.lower()


class TestSymlinkDefenseInReadFile:
    """End-to-end checks that the read_file tool applies the symlink guard."""

    def test_read_file_rejects_symlink_to_outside(self, tmp_path: Path) -> None:
        """Reading through a symlink that targets a file outside the repo is refused."""
        repo_root = tmp_path / "repo"
        repo_root.mkdir()
        (repo_root / "safe.py").write_text("safe")
        external_file = tmp_path / "secret.py"
        external_file.write_text("stolen data")
        (repo_root / "evil_link.py").symlink_to(external_file)

        read_file = _make_read_file(repo_root)
        message = read_file("evil_link.py").lower()

        assert "symlink" in message or "not allowed" in message

    def test_read_file_allows_regular_file(self, tmp_path: Path) -> None:
        """An ordinary file is still readable once the guard is in place."""
        repo_root = tmp_path / "repo"
        repo_root.mkdir()
        (repo_root / "safe.py").write_text("safe content here")

        read_file = _make_read_file(repo_root)
        contents = read_file("safe.py")

        assert "safe content here" in contents


class TestSymlinkDefenseInListFiles:
    """End-to-end checks that the list_files tool applies the symlink guard."""

    def test_list_files_rejects_symlink_directory(self, tmp_path: Path) -> None:
        """Listing a symlinked directory that targets a location outside the repo is refused."""
        repo_root = tmp_path / "repo"
        repo_root.mkdir()
        (repo_root / "safe.py").write_text("safe")
        external_dir = tmp_path / "outside"
        external_dir.mkdir()
        (external_dir / "secret.py").write_text("stolen")
        (repo_root / "evil_dir").symlink_to(external_dir)

        list_files = _make_list_files(repo_root)
        message = list_files("evil_dir").lower()

        assert "symlink" in message or "not allowed" in message

    def test_list_files_allows_regular_directory(self, tmp_path: Path) -> None:
        """An ordinary directory is still listable once the guard is in place."""
        repo_root = tmp_path / "repo"
        source_dir = repo_root / "src"
        source_dir.mkdir(parents=True)
        (source_dir / "main.py").write_text("code")

        list_files = _make_list_files(repo_root)
        listing = list_files("src")

        assert "main.py" in listing
Loading
Loading