# Workflow: Retry Infrastructure Failures
# (Header lines such as "Skip to content" and the run title "#153" were
# GitHub web-UI page chrome captured by copy/paste, not part of the file.)
name: Retry Infrastructure Failures

on:
  # Run hourly at minute 43 (off the top of the hour to avoid load spikes).
  schedule:
    - cron: '43 * * * *'
  # Allow manual runs from the Actions tab.
  workflow_dispatch:

permissions:
  # Needed to re-run failed workflow runs.
  actions: write
  # Needed to toggle auto-merge on pull requests.
  pull-requests: write
jobs:
  # Scans recent failed runs of the PR workflow and re-runs the ones whose
  # failure looks like infrastructure trouble rather than a real test failure.
  retry-failed:
    runs-on: ubuntu-latest
    steps:
      - name: Retry PR runs failed due to infrastructure errors
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GH_REPO: ${{ github.repository }}
        run: |
          set -euo pipefail
          MAX_RERUNS=10
          rerun_count=0
          cutoff=$(date -u -d '6 hours ago' +%Y-%m-%dT%H:%M:%SZ)
          echo "Looking for failed PR workflow runs since $cutoff..."
          echo ""
          # Get recent failed runs for the PR workflow that have not been retried yet
          run_ids=$(gh run list \
            --repo "$GH_REPO" \
            --workflow pull_request.yml \
            --status failure \
            --limit 50 \
            --json databaseId,attempt,createdAt \
            --jq ".[] | select(.attempt == 1 and .createdAt >= \"$cutoff\") | .databaseId")
          if [ -z "$run_ids" ]; then
            echo "No recent failed runs found."
            exit 0
          fi
          for run_id in $run_ids; do
            if [ "$rerun_count" -ge "$MAX_RERUNS" ]; then
              echo "Reached maximum of $MAX_RERUNS reruns, stopping."
              break
            fi
            run_url="https://github.com/$GH_REPO/actions/runs/$run_id"
            echo "Checking run $run_url ..."
            should_rerun=false
            # Fetch all job data once (reused for multiple checks below)
            jobs_raw=$(gh api "repos/$GH_REPO/actions/runs/$run_id/jobs?per_page=100" --paginate)
            # Collect per-job verdicts across all pages (runs can have >100 jobs).
            # Each failed job emits "true" (infrastructure) or "false" (real failure).
            # A job is considered an infrastructure failure if:
            #   - it is the "Config Workflow" or "Finish Workflow" job (pipeline plumbing,
            #     not actual tests — always treated as infrastructure failure), or
            #   - it never reached its main "Run" step (failed during checkout/setup), or
            #   - the "Run" step was skipped, or
            #   - the "Run" step failed almost immediately (under 2 minutes), indicating
            #     a setup/download issue (e.g. missing S3 credentials) rather than a real
            #     test failure
            verdicts=$(echo "$jobs_raw" | jq -r '
              .jobs[] | select(.conclusion == "failure") |
              if .name == "Config Workflow" or .name == "Finish Workflow" then true
              else
                [.steps[] | select(.name == "Run")] |
                if length == 0 then true
                elif .[0].conclusion == "skipped" then true
                elif .[0].conclusion == "failure" then
                  ((.[0].completed_at | fromdateiso8601) -
                   (.[0].started_at | fromdateiso8601)) < 120
                else false
                end
              end
            ')
            # Infrastructure failure = at least one failed job, and all of them are infra
            if [ -z "$verdicts" ]; then
              :
            elif echo "$verdicts" | grep -q "false"; then
              :
            else
              should_rerun=true
              echo " Infrastructure failure detected (job-level heuristic)."
            fi
            # Fetch run metadata once for all checks below
            run_data=$(gh api "repos/$GH_REPO/actions/runs/$run_id" \
              --jq '{pr: .pull_requests[0].number, sha: .head_sha}')
            pr_number=$(echo "$run_data" | jq -r '.pr // empty')
            run_sha=$(echo "$run_data" | jq -r '.sha')
            # If the job-level heuristic didn't trigger, check Praktika result JSONs
            # on S3 for failures at the "Checkout Submodules" sub-step (e.g. transient
            # DNS failures cloning git submodules).
            if [ "$should_rerun" = "false" ] && [ -n "$verdicts" ] && [ -n "$pr_number" ] && [ -n "$run_sha" ]; then
              # Get names of failed jobs (excluding pipeline plumbing)
              failed_job_names=$(echo "$jobs_raw" | jq -r '
                .jobs[] | select(.conclusion == "failure") |
                select(.name != "Config Workflow" and .name != "Finish Workflow") |
                .name
              ')
              all_checkout_failures=true
              while IFS= read -r job_name; do
                [ -z "$job_name" ] && continue
                # Normalize job name to match Praktika result file naming:
                # lowercase, replace non-alphanumeric chars with underscore, collapse runs
                normalized=$(echo "$job_name" | tr '[:upper:]' '[:lower:]' | \
                  sed 's/[^a-z0-9_]/_/g; s/__*/_/g')
                result_url="https://s3.amazonaws.com/clickhouse-test-reports/PRs/${pr_number}/${run_sha}/result_${normalized}.json"
                result_json=$(curl -sf --compressed "$result_url" 2>/dev/null || true)
                if [ -z "$result_json" ]; then
                  all_checkout_failures=false
                  break
                fi
                # Check: all failed sub-results must be "Checkout Submodules"
                has_non_checkout_failure=$(echo "$result_json" | jq -r '
                  [.results[] | select(.status == "failure" or .status == "error") |
                   .name] | map(select(. != "Checkout Submodules")) | length > 0
                ')
                if [ "$has_non_checkout_failure" = "true" ]; then
                  all_checkout_failures=false
                  break
                fi
              done <<< "$failed_job_names"
              if [ "$all_checkout_failures" = "true" ] && [ -n "$failed_job_names" ]; then
                should_rerun=true
                echo " Infrastructure failure detected (submodule checkout failure in Praktika results)."
              fi
            fi
            # Check if "Config Workflow" failed in its "Run" step (e.g. due to
            # pr_labels_and_category.py rejecting the changelog category). If the PR
            # description was edited after the failure (same HEAD commit, so no new
            # workflow run was triggered), re-run to pick up the fix.
            if [ "$should_rerun" = "false" ]; then
              config_failed_at=$(echo "$jobs_raw" | jq -r '
                [.jobs[]
                 | select(.name == "Config Workflow" and .conclusion == "failure")
                 | .steps[] | select(.name == "Run" and .conclusion == "failure")
                 | .completed_at
                ] | first // empty
              ')
              if [ -n "$config_failed_at" ] && [ -n "$pr_number" ]; then
                pr_data=$(gh api "repos/$GH_REPO/pulls/$pr_number" \
                  --jq '{sha: .head.sha, updated: .updated_at}')
                pr_sha=$(echo "$pr_data" | jq -r '.sha')
                pr_updated=$(echo "$pr_data" | jq -r '.updated')
                # Lexicographic [[ > ]] comparison is valid for ISO-8601 UTC timestamps.
                if [ "$run_sha" = "$pr_sha" ] && [[ "$pr_updated" > "$config_failed_at" ]]; then
                  should_rerun=true
                  echo " Config Workflow failed but PR #$pr_number was updated after — rerunning."
                fi
              fi
            fi
            if [ "$should_rerun" = "true" ]; then
              if gh run rerun "$run_id" --repo "$GH_REPO"; then
                rerun_count=$((rerun_count + 1))
                echo " Triggered rerun: $run_url/attempts/2"
              else
                echo " Failed to trigger rerun (may already be rerunning)"
              fi
            else
              echo " Not an infrastructure failure, skipping."
            fi
            echo ""
          done
          echo "Done. Triggered $rerun_count rerun(s)."
fix-stuck-auto-merge:
runs-on: ubuntu-latest
steps:
- name: Fix PRs stuck with auto-merge enabled but not queued
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GH_REPO: ${{ github.repository }}
run: |
set -euo pipefail
echo "Looking for PRs with auto-merge enabled but not queued..."
echo ""
now=$(date +%s)
cutoff_seconds=3600 # 60 minutes
# Get open PRs with auto-merge enabled that have mergeStateStatus CLEAN
# or UNKNOWN (checks passed but not queued into merge queue).
# UNKNOWN can happen when GitHub's merge status evaluation gets stuck.
# We use GraphQL pagination (20 PRs per page) to avoid 502 errors
# that occur when fetching too many PRs in a single request.
owner="${GH_REPO%%/*}"
repo_name="${GH_REPO##*/}"
cat > /tmp/query.graphql << 'QUERYEOF'
query($owner: String!, $repo: String!, $cursor: String) {
repository(owner: $owner, name: $repo) {
pullRequests(states: OPEN, first: 20, after: $cursor, orderBy: {field: UPDATED_AT, direction: DESC}) {
pageInfo { hasNextPage endCursor }
nodes {
number
isDraft
mergeStateStatus
autoMergeRequest { enabledAt }
}
}
}
}
QUERYEOF
stuck_prs='[]'
cursor_args=()
for i in $(seq 1 10); do
response=$(gh api graphql \
-f query="$(cat /tmp/query.graphql)" \
-f owner="$owner" \
-f repo="$repo_name" \
"${cursor_args[@]}")
filtered=$(echo "$response" | jq '[
.data.repository.pullRequests.nodes[] | select(
.autoMergeRequest != null and
(.mergeStateStatus == "CLEAN" or .mergeStateStatus == "UNKNOWN") and
.isDraft == false
) | {number, autoMergeRequest, mergeStateStatus, isDraft}
]')
stuck_prs=$(echo "$stuck_prs" "$filtered" | jq -s '.[0] + .[1]')
has_next=$(echo "$response" | jq -r '.data.repository.pullRequests.pageInfo.hasNextPage')
if [ "$has_next" != "true" ]; then
break
fi
cursor=$(echo "$response" | jq -r '.data.repository.pullRequests.pageInfo.endCursor')
cursor_args=(-f cursor="$cursor")
done
count=$(echo "$stuck_prs" | jq 'length')
if [ "$count" -eq 0 ]; then
echo "No stuck PRs found."
exit 0
fi
echo "Found $count candidate PR(s), checking age..."
echo ""
fixed=0
pr_numbers=$(echo "$stuck_prs" | jq -r '.[].number')
for pr_number in $pr_numbers; do
enabled_at=$(echo "$stuck_prs" | jq -r ".[] | select(.number == $pr_number) | .autoMergeRequest.enabledAt")
enabled_ts=$(date -d "$enabled_at" +%s 2>/dev/null || echo 0)
age_seconds=$((now - enabled_ts))
if [ "$age_seconds" -lt "$cutoff_seconds" ]; then
echo "PR #$pr_number: auto-merge enabled ${age_seconds}s ago (< ${cutoff_seconds}s), skipping."
continue
fi
age_minutes=$((age_seconds / 60))
echo "PR #$pr_number: stuck for ${age_minutes} minutes, retoggling auto-merge..."
if gh pr merge "$pr_number" --disable-auto --repo "$GH_REPO"; then
sleep 2
if gh pr merge "$pr_number" --auto --repo "$GH_REPO"; then
echo " OK: Auto-merge retoggled for PR #$pr_number"
fixed=$((fixed + 1))
else
echo " ERROR: Failed to re-enable auto-merge for PR #$pr_number"
fi
else
echo " ERROR: Failed to disable auto-merge for PR #$pr_number"
fi
echo ""
done
echo "Done. Fixed $fixed stuck PR(s)."