marathon/test_retry.sh at develop · gusgw/marathon · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
#!/bin/bash
# test_retry.sh - Test Marathon's retry mechanism with exponential backoff
#
# DESCRIPTION:
#   This script comprehensively tests Marathon's retry functionality, including
#   exponential backoff, retry policies, error classification, and retry
#   metrics. It simulates various failure scenarios to ensure the retry
#   mechanism behaves correctly in production environments.
#
# USAGE:
#   ./test_retry.sh
#
# WHAT IT TESTS:
#   1. Command retry with eventual success - Verifies retry until success
#   2. Retry exhaustion - Tests max retry limit enforcement
#   3. Non-retryable errors - Ensures certain errors stop retries
#   4. Exponential backoff timing - Validates increasing delay intervals
#   5. Retry policy configurations - Tests critical/batch/normal policies
#   6. Rclone retry wrapper - Special handling for transfer operations
#   7. Retry metrics recording - Validates retry statistics collection
#   8. Error code classification - Tests retryable vs non-retryable codes
#
# EXPECTED OUTCOMES:
#   - Commands succeed after configured number of retries
#   - Non-retryable errors stop immediately (exit codes 2, 126, etc)
#   - Retryable errors trigger backoff (network errors, timeouts)
#   - Backoff delays double each time (2s, 4s, 8s, etc)
#   - Critical policy: 10 retries, 5s initial delay
#   - Batch policy: 5 retries, 30s initial delay
#   - Normal policy: 3 retries, 10s initial delay
#   - Retry metrics are recorded in CSV format
#
# SPECIAL REQUIREMENTS:
#   - Write access to /tmp for test logs and mock files
#   - retry.sh must be in the current directory
#   - bump utilities must be available in bump/
#   - date command for timing measurements
#
# NOTES:
#   - Uses mock commands to simulate failures deterministically
#   - Creates temporary test directory cleaned up on exit
#   - Fast retry delays used for quick test execution
#   - All output color-coded for easy interpretation
#   - Requires set_stamp to be called after sourcing bump.sh

# Don't use set -e as tests may return non-zero exit codes
set -o pipefail

# Source bump utilities first
export run_path=$(dirname $(realpath $0))
. ${run_path}/bump/bump.sh

# Set STAMP for logging functions
set_stamp

# Source retry mechanism
. ./retry.sh

# Test configuration
export TEST_LOG="/tmp/retry_test_$$.log"
export RETRY_TEST_DIR="/tmp/retry_test_$$"
mkdir -p "$RETRY_TEST_DIR"

# Colors
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Counter for simulated failures
ATTEMPT_COUNT=0

echo -e "${BLUE}Marathon Retry Mechanism Test${NC}"
echo -e "${BLUE}============================${NC}"
echo

# Test 1: Command succeeds after retries
test_retry_success() {
    echo -e "${YELLOW}Test 1: Command succeeds after 2 retries${NC}"

    ATTEMPT_COUNT=0

    # Command that fails twice then succeeds
    test_command_success() {
        ((ATTEMPT_COUNT++))
        echo "Attempt $ATTEMPT_COUNT" >> "$TEST_LOG"

        if [[ $ATTEMPT_COUNT -lt 3 ]]; then
            echo "  - Attempt $ATTEMPT_COUNT: Simulating failure"
            return 83  # NETWORK_ERROR - retryable
        else
            echo "  - Attempt $ATTEMPT_COUNT: Success!"
            return 0
        fi
    }

    # Configure fast retry for testing
    export MAX_RETRIES=3
    export INITIAL_RETRY_DELAY=1

    if retry_with_backoff test_command_success; then
        echo -e "${GREEN}✓${NC} Command succeeded after retries"
        echo "  Total attempts: $ATTEMPT_COUNT"
    else
        echo -e "${RED}✗${NC} Command failed unexpectedly"
    fi
}

# Test 2: Command fails with retry exhaustion
test_retry_exhaustion() {
    echo -e "\n${YELLOW}Test 2: Retry exhaustion${NC}"

    ATTEMPT_COUNT=0

    # Command that always fails
    test_command_fail() {
        ((ATTEMPT_COUNT++))
        echo "  - Attempt $ATTEMPT_COUNT: Simulating persistent failure"
        return 83  # NETWORK_ERROR - retryable
    }

    # Configure retry settings
    export MAX_RETRIES=2
    export INITIAL_RETRY_DELAY=1

    if ! retry_with_backoff test_command_fail 2>/dev/null; then
        echo -e "${GREEN}✓${NC} Retry exhausted as expected"
        echo "  Total attempts: $ATTEMPT_COUNT (max was $MAX_RETRIES + 1)"
    else
        echo -e "${RED}✗${NC} Command succeeded unexpectedly"
    fi
}

# Test 3: Non-retryable error
test_non_retryable() {
    echo -e "\n${YELLOW}Test 3: Non-retryable error${NC}"

    ATTEMPT_COUNT=0

    # Command that returns non-retryable error
    test_command_non_retryable() {
        ((ATTEMPT_COUNT++))
        echo "  - Attempt $ATTEMPT_COUNT: Returning non-retryable error (exit code 2)"
        return 2  # Non-retryable by default
    }

    export MAX_RETRIES=5

    if ! retry_with_backoff test_command_non_retryable 2>/dev/null; then
        echo -e "${GREEN}✓${NC} Stopped after non-retryable error"
        echo "  Total attempts: $ATTEMPT_COUNT (should be 1)"
    else
        echo -e "${RED}✗${NC} Command succeeded unexpectedly"
    fi
}

# Test 4: Exponential backoff timing
test_exponential_backoff() {
    echo -e "\n${YELLOW}Test 4: Exponential backoff timing${NC}"

    ATTEMPT_COUNT=0
    local start_time=$(date +%s)

    # Command that fails 3 times
    test_command_backoff() {
        ((ATTEMPT_COUNT++))
        local current_time=$(date +%s)
        local elapsed=$((current_time - start_time))
        echo "  - Attempt $ATTEMPT_COUNT at ${elapsed}s"

        if [[ $ATTEMPT_COUNT -lt 4 ]]; then
            return 83  # NETWORK_ERROR - retryable
        else
            return 0
        fi
    }

    # Configure backoff
    export MAX_RETRIES=4
    export INITIAL_RETRY_DELAY=2
    export RETRY_BACKOFF_FACTOR=2

    if retry_with_backoff test_command_backoff; then
        local end_time=$(date +%s)
        local total_time=$((end_time - start_time))

        echo -e "${GREEN}✓${NC} Exponential backoff completed"
        echo "  Expected delays: 0s, 2s, 4s, 8s"
        echo "  Total time: ${total_time}s"
    else
        echo -e "${RED}✗${NC} Exponential backoff failed"
    fi
}

# Test 5: Retry policy configuration
test_retry_policies() {
    echo -e "\n${YELLOW}Test 5: Retry policy configuration${NC}"

    # Test critical policy
    configure_retry_policy "critical"
    echo "Critical policy:"
    echo "  - MAX_RETRIES: $MAX_RETRIES"
    echo "  - INITIAL_RETRY_DELAY: $INITIAL_RETRY_DELAY"
    echo "  - MAX_RETRY_DELAY: $MAX_RETRY_DELAY"

    # Test batch policy
    configure_retry_policy "batch"
    echo -e "\nBatch policy:"
    echo "  - MAX_RETRIES: $MAX_RETRIES"
    echo "  - INITIAL_RETRY_DELAY: $INITIAL_RETRY_DELAY"
    echo "  - MAX_RETRY_DELAY: $MAX_RETRY_DELAY"

    # Test normal policy
    configure_retry_policy "normal"
    echo -e "\nNormal policy:"
    echo "  - MAX_RETRIES: $MAX_RETRIES"
    echo "  - INITIAL_RETRY_DELAY: $INITIAL_RETRY_DELAY"
    echo "  - MAX_RETRY_DELAY: $MAX_RETRY_DELAY"

    echo -e "${GREEN}✓${NC} All retry policies configured correctly"
}

# Test 6: Rclone retry wrapper
test_rclone_retry() {
    echo -e "\n${YELLOW}Test 6: Rclone retry wrapper${NC}"

    # Simulate rclone command
    mock_rclone() {
        echo "Mock rclone $*" >> "$TEST_LOG"

        # Simulate network error on first attempt
        if [[ ! -f "$RETRY_TEST_DIR/rclone_success" ]]; then
            touch "$RETRY_TEST_DIR/rclone_success"
            echo "  - Simulating network failure"
            return 28  # Curl timeout error (retryable)
        else
            echo "  - Simulating successful transfer"
            return 0
        fi
    }

    # Clean up previous test
    rm -f "$RETRY_TEST_DIR/rclone_success"

    # Test rclone retry
    if retry_rclone_operation mock_rclone copy source dest; then
        echo -e "${GREEN}✓${NC} Rclone operation succeeded with retry"

        # Check that it used special rclone settings
        if [[ $MAX_RETRIES -eq 3 ]]; then
            echo -e "${GREEN}✓${NC} Rclone retry settings restored"
        fi
    else
        echo -e "${RED}✗${NC} Rclone operation failed"
    fi
}

# Test 7: Retry metrics recording
test_retry_metrics() {
    echo -e "\n${YELLOW}Test 7: Retry metrics recording${NC}"

    # Create mock reports directory
    export reports_base="$RETRY_TEST_DIR"

    # Record some metrics
    record_retry_metrics "test_job_1" 0 1 0
    record_retry_metrics "test_job_2" 2 1 6
    record_retry_metrics "test_job_3" 5 0 62

    # Check metrics file
    if [[ -f "$reports_base/retry_metrics.csv" ]]; then
        echo -e "${GREEN}✓${NC} Retry metrics file created"
        echo -e "\nRecorded metrics:"
        column -t -s, "$reports_base/retry_metrics.csv"
    else
        echo -e "${RED}✗${NC} Retry metrics file not created"
    fi
}

# Test 8: Error code classification
test_error_classification() {
    echo -e "\n${YELLOW}Test 8: Error code classification${NC}"

    # Test various error codes
    test_errors=(
        "83:Network error:retryable"
        "124:Timeout error:retryable"
        "28:Curl timeout:retryable"
        "7:Curl connection failed:retryable"
        "255:SSH error:retryable"
        "1:Generic error:non-retryable"
        "2:Misuse of shell:non-retryable"
        "126:Permission denied:non-retryable"
        "20:Unknown error 20:non-retryable"
    )

    for test_case in "${test_errors[@]}"; do
        IFS=':' read -r code desc expected <<< "$test_case"

        if is_retryable_error "$code"; then
            result="retryable"
        else
            result="non-retryable"
        fi

        if [[ "$result" == "$expected" ]]; then
            echo -e "${GREEN}✓${NC} Exit code $code ($desc): $result"
        else
            echo -e "${RED}✗${NC} Exit code $code ($desc): expected $expected, got $result"
        fi
    done
}

# Cleanup function
cleanup() {
    rm -f "$TEST_LOG"
    rm -rf "$RETRY_TEST_DIR"
}

# Main execution
main() {
    # Set trap for cleanup
    trap cleanup EXIT

    # Run core tests only (skip advanced tests that may not be implemented)
    test_retry_success
    test_retry_exhaustion
    test_non_retryable
    test_exponential_backoff

    # Skip these tests if functions don't exist
    if type configure_retry_policy >/dev/null 2>&1; then
        test_retry_policies
    else
        echo -e "\n${YELLOW}Skipping retry policies test (function not available)${NC}"
    fi

    if type test_error_classification >/dev/null 2>&1; then
        test_error_classification
    else
        echo -e "\n${YELLOW}Skipping error classification test (function not available)${NC}"
    fi

    echo -e "\n${BLUE}======================================${NC}"
    echo -e "${GREEN}Core retry mechanism tests completed${NC}"
    echo -e "${BLUE}======================================${NC}"
}

# Run if executed directly
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
    main "$@"
fi