Skip to content

Commit 699537c

Browse files
committed
Address PR #436 R5 P1: token-boundary py-exec match + ln-symlink overwrite + regression tests
Reviewer flagged: both python-execution regexes used `\b` after `.py`, which matches between `y` (word) and a following `.` (non-word). So `python3 /tmp/notebook_md_extract.py.bak` matched as `/tmp/notebook_md_extract.py`, and the captured path was treated as the allowlisted exact path. Same class of bypass for `.py~`, `.pyc`, `.pyx`, `.py.evil`. Fixes: - Replace `\b` after `.py` and after `tmp_path` with `(?=[\s;|&]|$)` shell-token-boundary lookahead in: - test_workflow_python_file_execution_uses_only_allowlisted_paths (the allowlist check) - test_workflow_allowlisted_tmp_python_executions_have_base_sha_staging (staging_re, py_exec_re, overwrite_re — all three) - Add `ln`-based overwrite patterns to overwrite_re (`ln -s`, `ln -sf`, `ln -f`, etc.). Preempts a future `ln -sf evil.py /tmp/<path>` symlink attack between staging and execution. - Add 2 new regression tests: - test_python_exec_regex_rejects_suffixed_paths: asserts legit forms (with arg, with ;, with |, with &&, end-of-line) match correctly AND suffix bypasses (`.py.bak`, `.py~`, `.pyc`, `.pyx`, `.py.evil`) are rejected. - test_overwrite_regex_rejects_suffixed_paths: asserts real overwrites (>, >>, cp, mv, tee [-a], ln -s/-sf) match AND suffix-only writes do NOT match (they don't corrupt the allowlisted exact path). All 10 guard tests pass. Test count grew from 8 to 10.
1 parent 8e4006c commit 699537c

1 file changed

Lines changed: 110 additions & 9 deletions

File tree

tests/test_openai_review.py

Lines changed: 110 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2865,7 +2865,16 @@ def test_workflow_python_file_execution_uses_only_allowlisted_paths(
28652865
run_contents = self._extract_all_run_content(workflow_text)
28662866
assert run_contents, "No `run:` content extracted"
28672867

2868-
py_exec_re = re.compile(r"\bpython3?\s+(\S+\.py)\b")
2868+
# Use `(?=[\s;|&]|$)` token-boundary lookahead instead of `\b`
2869+
# after `.py`. The weaker `\b` matches between `y` and a
2870+
# following `.` (non-word), so `python3 /tmp/foo.py.bak`
2871+
# would match the `.py` prefix and the captured path would
2872+
# be `/tmp/foo.py` (in allowlist) — letting an attacker run
2873+
# `.py.bak`/`.py~`/`.pyx`/etc. under an allowlisted prefix.
2874+
# R5 fix for PR #436.
2875+
py_exec_re = re.compile(
2876+
r"\bpython3?\s+(\S+\.py)(?=[\s;|&]|$)",
2877+
)
28692878

28702879
violations = []
28712880
for label, content in run_contents:
@@ -2954,11 +2963,17 @@ def test_workflow_allowlisted_tmp_python_executions_have_base_sha_staging(
29542963
+ re.escape(base_source)
29552964
+ r'[ \t]+>[ \t]+'
29562965
+ re.escape(tmp_path)
2957-
+ r'\b',
2966+
+ r'(?=[\s;|&]|$)',
29582967
)
2959-
# Pattern B: python execution of this tmp_path.
2968+
# Pattern B: python execution of this tmp_path. Use
2969+
# `(?=[\s;|&]|$)` token-boundary lookahead instead of `\b`
2970+
# for the same reason as the sibling test above
2971+
# (`\bfoo.py\b` matches as a prefix of `foo.py.bak`).
2972+
# R5 fix for PR #436.
29602973
py_exec_re = re.compile(
2961-
r'\bpython3?[ \t]+' + re.escape(tmp_path) + r'\b',
2974+
r'\bpython3?[ \t]+'
2975+
+ re.escape(tmp_path)
2976+
+ r'(?=[\s;|&]|$)',
29622977
)
29632978
# Pattern C: any non-staging write to tmp_path that would
29642979
# overwrite the BASE-staged content. Matches:
@@ -2967,15 +2982,20 @@ def test_workflow_allowlisted_tmp_python_executions_have_base_sha_staging(
29672982
# cp <src> <path>
29682983
# mv <src> <path>
29692984
# tee [-a] <path>
2985+
# ln [-s/-sf/-f] <src> <path> (symlink, R5 preempt)
29702986
# We exclude lines that match pattern A (the staging line
29712987
# itself uses `>`, which would otherwise self-flag).
2988+
# Token-boundary lookahead `(?=[\s;|&]|$)` (not `\b`)
2989+
# prevents `> /tmp/foo.py.bak` from matching as `> /tmp/foo.py`.
2990+
tail = r'(?=[\s;|&]|$)'
29722991
overwrite_re = re.compile(
29732992
r'(?:'
2974-
r'>>?[ \t]*' + re.escape(tmp_path) + r'\b'
2975-
r'|cp[ \t]+\S+[ \t]+' + re.escape(tmp_path) + r'\b'
2976-
r'|mv[ \t]+\S+[ \t]+' + re.escape(tmp_path) + r'\b'
2977-
r'|tee[ \t]+(?:-a[ \t]+)?' + re.escape(tmp_path) + r'\b'
2978-
r')',
2993+
r'>>?[ \t]*' + re.escape(tmp_path) + tail
2994+
+ r'|cp[ \t]+\S+[ \t]+' + re.escape(tmp_path) + tail
2995+
+ r'|mv[ \t]+\S+[ \t]+' + re.escape(tmp_path) + tail
2996+
+ r'|tee[ \t]+(?:-[a-zA-Z]+[ \t]+)?' + re.escape(tmp_path) + tail
2997+
+ r'|ln[ \t]+(?:-[a-zA-Z]+[ \t]+)?\S+[ \t]+' + re.escape(tmp_path) + tail
2998+
+ r')',
29792999
)
29803000

29813001
staging_indices = []
@@ -3135,6 +3155,87 @@ def test_workflow_open_pr_checkout_uses_head_repo_and_head_sha(
31353155
"Found checkout block:\n" + checkout_block
31363156
)
31373157

3158+
def test_python_exec_regex_rejects_suffixed_paths(self):
3159+
"""Regression: `python3 /tmp/notebook_md_extract.py.bak` must
3160+
NOT be treated as the allowlisted `/tmp/notebook_md_extract.py`.
3161+
Likewise for `~`, `.swp`, `.pyc` and other suffixes. R5 fix
3162+
for PR #436: prior `\\b` boundary matched between `y` and `.`
3163+
(non-word char), letting suffixed paths bypass the allowlist."""
3164+
import re
3165+
3166+
py_exec_re = re.compile(
3167+
r"\bpython3?\s+(\S+\.py)(?=[\s;|&]|$)",
3168+
)
3169+
# Should match (legit forms): captures the exact path.
3170+
for line, expected in [
3171+
("python3 /tmp/notebook_md_extract.py", "/tmp/notebook_md_extract.py"),
3172+
("python3 /tmp/notebook_md_extract.py --input foo", "/tmp/notebook_md_extract.py"),
3173+
("python3 /tmp/notebook_md_extract.py;", "/tmp/notebook_md_extract.py"),
3174+
("python3 /tmp/notebook_md_extract.py | tee log", "/tmp/notebook_md_extract.py"),
3175+
("python /tmp/foo.py && echo done", "/tmp/foo.py"),
3176+
]:
3177+
matches = py_exec_re.findall(line)
3178+
assert matches == [expected], (
3179+
f"Expected {expected!r} captured from {line!r}, got {matches!r}"
3180+
)
3181+
# Should NOT match (suffix bypasses): regex must reject these.
3182+
for line in [
3183+
"python3 /tmp/notebook_md_extract.py.bak",
3184+
"python3 /tmp/notebook_md_extract.py~",
3185+
"python3 /tmp/notebook_md_extract.pyc",
3186+
"python3 /tmp/notebook_md_extract.pyx",
3187+
"python3 /tmp/foo.py.evil",
3188+
]:
3189+
matches = py_exec_re.findall(line)
3190+
assert matches == [], (
3191+
f"Suffix-bypass {line!r} must NOT match the allowlist regex; "
3192+
f"got {matches!r}. R5 dismissal-test fix would regress."
3193+
)
3194+
3195+
def test_overwrite_regex_rejects_suffixed_paths(self):
3196+
"""Regression: an overwrite to `/tmp/foo.py.bak` must NOT be
3197+
treated as an overwrite to the allowlisted `/tmp/foo.py`. Used
3198+
to prevent the staging-vs-execution ordering check from being
3199+
confused by suffix-only writes (which do not actually corrupt
3200+
the BASE-staged file)."""
3201+
import re
3202+
3203+
tmp_path = "/tmp/notebook_md_extract.py"
3204+
tail = r"(?=[\s;|&]|$)"
3205+
overwrite_re = re.compile(
3206+
r"(?:"
3207+
r">>?[ \t]*" + re.escape(tmp_path) + tail
3208+
+ r"|cp[ \t]+\S+[ \t]+" + re.escape(tmp_path) + tail
3209+
+ r"|mv[ \t]+\S+[ \t]+" + re.escape(tmp_path) + tail
3210+
+ r"|tee[ \t]+(?:-[a-zA-Z]+[ \t]+)?" + re.escape(tmp_path) + tail
3211+
+ r"|ln[ \t]+(?:-[a-zA-Z]+[ \t]+)?\S+[ \t]+" + re.escape(tmp_path) + tail
3212+
+ r")",
3213+
)
3214+
# Should match (real overwrites of the allowlisted path):
3215+
for line in [
3216+
"echo x > /tmp/notebook_md_extract.py",
3217+
"cat foo >> /tmp/notebook_md_extract.py",
3218+
"cp src.py /tmp/notebook_md_extract.py",
3219+
"mv src.py /tmp/notebook_md_extract.py",
3220+
"echo x | tee /tmp/notebook_md_extract.py",
3221+
"echo x | tee -a /tmp/notebook_md_extract.py",
3222+
"ln -sf evil.py /tmp/notebook_md_extract.py",
3223+
"ln -s evil.py /tmp/notebook_md_extract.py",
3224+
]:
3225+
assert overwrite_re.search(line), (
3226+
f"Real overwrite {line!r} must match the overwrite regex"
3227+
)
3228+
# Should NOT match (suffix variants — different files):
3229+
for line in [
3230+
"echo x > /tmp/notebook_md_extract.py.bak",
3231+
"cp src.py /tmp/notebook_md_extract.py~",
3232+
"ln -sf evil.py /tmp/notebook_md_extract.pyc",
3233+
]:
3234+
assert not overwrite_re.search(line), (
3235+
f"Suffix-only write {line!r} must NOT match the overwrite "
3236+
f"regex (it does not corrupt the allowlisted exact path)"
3237+
)
3238+
31383239
def test_workflow_comment_triggers_require_author_association(
31393240
self, workflow_text
31403241
):

0 commit comments

Comments
 (0)