@@ -2865,7 +2865,16 @@ def test_workflow_python_file_execution_uses_only_allowlisted_paths(
28652865 run_contents = self ._extract_all_run_content (workflow_text )
28662866 assert run_contents , "No `run:` content extracted"
28672867
2868- py_exec_re = re .compile (r"\bpython3?\s+(\S+\.py)\b" )
2868+ # Use `(?=[\s;|&]|$)` token-boundary lookahead instead of `\b`
2869+ # after `.py`. The weaker `\b` matches between `y` and a
2870+ # following `.` (non-word), so `python3 /tmp/foo.py.bak`
2871+ # would match the `.py` prefix and the captured path would
2872+ # be `/tmp/foo.py` (in allowlist) — letting an attacker run
2873+ # `.py.bak`/`.py~`/`.pyx`/etc. under an allowlisted prefix.
2874+ # R5 fix for PR #436.
2875+ py_exec_re = re .compile (
2876+ r"\bpython3?\s+(\S+\.py)(?=[\s;|&]|$)" ,
2877+ )
28692878
28702879 violations = []
28712880 for label , content in run_contents :
@@ -2954,11 +2963,17 @@ def test_workflow_allowlisted_tmp_python_executions_have_base_sha_staging(
29542963 + re .escape (base_source )
29552964 + r'[ \t]+>[ \t]+'
29562965 + re .escape (tmp_path )
2957- + r'\b ' ,
2966+ + r'(?=[\s;|&]|$) ' ,
29582967 )
2959- # Pattern B: python execution of this tmp_path.
2968+ # Pattern B: python execution of this tmp_path. Use
2969+ # `(?=[\s;|&]|$)` token-boundary lookahead instead of `\b`
2970+ # for the same reason as the sibling test above
2971+ # (`\bfoo.py\b` matches as a prefix of `foo.py.bak`).
2972+ # R5 fix for PR #436.
29602973 py_exec_re = re .compile (
2961- r'\bpython3?[ \t]+' + re .escape (tmp_path ) + r'\b' ,
2974+ r'\bpython3?[ \t]+'
2975+ + re .escape (tmp_path )
2976+ + r'(?=[\s;|&]|$)' ,
29622977 )
29632978 # Pattern C: any non-staging write to tmp_path that would
29642979 # overwrite the BASE-staged content. Matches:
@@ -2967,15 +2982,20 @@ def test_workflow_allowlisted_tmp_python_executions_have_base_sha_staging(
29672982 # cp <src> <path>
29682983 # mv <src> <path>
29692984 # tee [-a] <path>
2985+ # ln [-s/-sf/-f] <src> <path> (symlink, R5 preempt)
29702986 # We exclude lines that match pattern A (the staging line
29712987 # itself uses `>`, which would otherwise self-flag).
2988+ # Token-boundary lookahead `(?=[\s;|&]|$)` (not `\b`)
2989+ # prevents `> /tmp/foo.py.bak` from matching as `> /tmp/foo.py`.
2990+ tail = r'(?=[\s;|&]|$)'
29722991 overwrite_re = re .compile (
29732992 r'(?:'
2974- r'>>?[ \t]*' + re .escape (tmp_path ) + r'\b'
2975- r'|cp[ \t]+\S+[ \t]+' + re .escape (tmp_path ) + r'\b'
2976- r'|mv[ \t]+\S+[ \t]+' + re .escape (tmp_path ) + r'\b'
2977- r'|tee[ \t]+(?:-a[ \t]+)?' + re .escape (tmp_path ) + r'\b'
2978- r')' ,
2993+ r'>>?[ \t]*' + re .escape (tmp_path ) + tail
2994+ + r'|cp[ \t]+\S+[ \t]+' + re .escape (tmp_path ) + tail
2995+ + r'|mv[ \t]+\S+[ \t]+' + re .escape (tmp_path ) + tail
2996+ + r'|tee[ \t]+(?:-[a-zA-Z]+[ \t]+)?' + re .escape (tmp_path ) + tail
2997+ + r'|ln[ \t]+(?:-[a-zA-Z]+[ \t]+)?\S+[ \t]+' + re .escape (tmp_path ) + tail
2998+ + r')' ,
29792999 )
29803000
29813001 staging_indices = []
@@ -3135,6 +3155,87 @@ def test_workflow_open_pr_checkout_uses_head_repo_and_head_sha(
31353155 "Found checkout block:\n " + checkout_block
31363156 )
31373157
3158+ def test_python_exec_regex_rejects_suffixed_paths (self ):
3159+ """Regression: `python3 /tmp/notebook_md_extract.py.bak` must
3160+ NOT be treated as the allowlisted `/tmp/notebook_md_extract.py`.
3161+ Likewise for `~`, `.swp`, `.pyc` and other suffixes. R5 fix
3162+ for PR #436: prior `\\ b` boundary matched between `y` and `.`
3163+ (non-word char), letting suffixed paths bypass the allowlist."""
3164+ import re
3165+
3166+ py_exec_re = re .compile (
3167+ r"\bpython3?\s+(\S+\.py)(?=[\s;|&]|$)" ,
3168+ )
3169+ # Should match (legit forms): captures the exact path.
3170+ for line , expected in [
3171+ ("python3 /tmp/notebook_md_extract.py" , "/tmp/notebook_md_extract.py" ),
3172+ ("python3 /tmp/notebook_md_extract.py --input foo" , "/tmp/notebook_md_extract.py" ),
3173+ ("python3 /tmp/notebook_md_extract.py;" , "/tmp/notebook_md_extract.py" ),
3174+ ("python3 /tmp/notebook_md_extract.py | tee log" , "/tmp/notebook_md_extract.py" ),
3175+ ("python /tmp/foo.py && echo done" , "/tmp/foo.py" ),
3176+ ]:
3177+ matches = py_exec_re .findall (line )
3178+ assert matches == [expected ], (
3179+ f"Expected { expected !r} captured from { line !r} , got { matches !r} "
3180+ )
3181+ # Should NOT match (suffix bypasses): regex must reject these.
3182+ for line in [
3183+ "python3 /tmp/notebook_md_extract.py.bak" ,
3184+ "python3 /tmp/notebook_md_extract.py~" ,
3185+ "python3 /tmp/notebook_md_extract.pyc" ,
3186+ "python3 /tmp/notebook_md_extract.pyx" ,
3187+ "python3 /tmp/foo.py.evil" ,
3188+ ]:
3189+ matches = py_exec_re .findall (line )
3190+ assert matches == [], (
3191+ f"Suffix-bypass { line !r} must NOT match the allowlist regex; "
3192+ f"got { matches !r} . R5 dismissal-test fix would regress."
3193+ )
3194+
3195+ def test_overwrite_regex_rejects_suffixed_paths (self ):
3196+ """Regression: an overwrite to `/tmp/foo.py.bak` must NOT be
3197+ treated as an overwrite to the allowlisted `/tmp/foo.py`. Used
3198+ to prevent the staging-vs-execution ordering check from being
3199+ confused by suffix-only writes (which do not actually corrupt
3200+ the BASE-staged file)."""
3201+ import re
3202+
3203+ tmp_path = "/tmp/notebook_md_extract.py"
3204+ tail = r"(?=[\s;|&]|$)"
3205+ overwrite_re = re .compile (
3206+ r"(?:"
3207+ r">>?[ \t]*" + re .escape (tmp_path ) + tail
3208+ + r"|cp[ \t]+\S+[ \t]+" + re .escape (tmp_path ) + tail
3209+ + r"|mv[ \t]+\S+[ \t]+" + re .escape (tmp_path ) + tail
3210+ + r"|tee[ \t]+(?:-[a-zA-Z]+[ \t]+)?" + re .escape (tmp_path ) + tail
3211+ + r"|ln[ \t]+(?:-[a-zA-Z]+[ \t]+)?\S+[ \t]+" + re .escape (tmp_path ) + tail
3212+ + r")" ,
3213+ )
3214+ # Should match (real overwrites of the allowlisted path):
3215+ for line in [
3216+ "echo x > /tmp/notebook_md_extract.py" ,
3217+ "cat foo >> /tmp/notebook_md_extract.py" ,
3218+ "cp src.py /tmp/notebook_md_extract.py" ,
3219+ "mv src.py /tmp/notebook_md_extract.py" ,
3220+ "echo x | tee /tmp/notebook_md_extract.py" ,
3221+ "echo x | tee -a /tmp/notebook_md_extract.py" ,
3222+ "ln -sf evil.py /tmp/notebook_md_extract.py" ,
3223+ "ln -s evil.py /tmp/notebook_md_extract.py" ,
3224+ ]:
3225+ assert overwrite_re .search (line ), (
3226+ f"Real overwrite { line !r} must match the overwrite regex"
3227+ )
3228+ # Should NOT match (suffix variants — different files):
3229+ for line in [
3230+ "echo x > /tmp/notebook_md_extract.py.bak" ,
3231+ "cp src.py /tmp/notebook_md_extract.py~" ,
3232+ "ln -sf evil.py /tmp/notebook_md_extract.pyc" ,
3233+ ]:
3234+ assert not overwrite_re .search (line ), (
3235+ f"Suffix-only write { line !r} must NOT match the overwrite "
3236+ f"regex (it does not corrupt the allowlisted exact path)"
3237+ )
3238+
31383239 def test_workflow_comment_triggers_require_author_association (
31393240 self , workflow_text
31403241 ):
0 commit comments