llamandcoco · llama90 · Jan 4, 2026 · Jan 3, 2026 · Jan 3, 2026 · Jan 3, 2026
diff --git a/applications/chatops/slack-bot/performance-tests/README.md b/applications/chatops/slack-bot/performance-tests/README.md
@@ -177,6 +177,26 @@ Includes:
 
 ## Advanced Usage
 
+### Configuring CloudWatch Logs Query Wait Time
+
+The `analyze-performance.sh` script uses a polling mechanism to wait for CloudWatch Logs Insights queries to complete. You can configure the behavior using environment variables:
+
+```bash
+# Maximum time to wait for a query to complete (default: 60 seconds)
+export CLOUDWATCH_QUERY_MAX_WAIT=120
+
+# Interval between status checks (default: 2 seconds)
+export CLOUDWATCH_QUERY_POLL_INTERVAL=3
+
+# Run analysis
+./analyze-performance.sh --from-test
+```
+
+These settings are useful when:
+- Querying large log volumes that take longer to process
+- Operating in regions with higher latency
+- Running queries during periods of high CloudWatch API load
+
 ### Custom Test Duration
 
 Edit `artillery-config.yml` phases:

diff --git a/applications/chatops/slack-bot/performance-tests/analyze-performance.sh b/applications/chatops/slack-bot/performance-tests/analyze-performance.sh
@@ -6,6 +6,10 @@ set -e
 ENVIRONMENT="${ENVIRONMENT:-plt}"
 REGION="ca-central-1"
 
+# Configurable query wait parameters
+QUERY_MAX_WAIT_TIME="${CLOUDWATCH_QUERY_MAX_WAIT:-60}"  # Maximum wait time in seconds
+QUERY_POLL_INTERVAL="${CLOUDWATCH_QUERY_POLL_INTERVAL:-2}"  # Poll interval in seconds
+
 # Check for flags
 OUTPUT_JSON=false
 USE_TEST_RESULT=false
@@ -52,11 +56,60 @@ minutes_ago_timestamp() {
   echo $((now - (minutes * 60)))
 }
 
+# Helper: Poll CloudWatch Logs Insights query until completion
+# Usage: wait_for_query_completion <query_id> <region>
+# Returns: 0 if query completed successfully, 1 if timeout or failed
+wait_for_query_completion() {
+  local query_id=$1
+  local region=$2
+  local elapsed=0
+
+  while [ $elapsed -lt $QUERY_MAX_WAIT_TIME ]; do
+    # Get query results and check for AWS CLI errors
+    local aws_output=$(aws logs get-query-results --query-id "$query_id" --region "$region" --output json 2>&1)
+    local aws_exit_code=$?
+
+    if [ $aws_exit_code -ne 0 ]; then
+      echo_info "  ⚠ AWS CLI error for query $query_id: $aws_output" >&2
+      return 1
+    fi
+
+    local status=$(echo "$aws_output" | jq -r '.status' 2>/dev/null)
+
+    if [ -z "$status" ] || [ "$status" = "null" ]; then
+      echo_info "  ⚠ Unable to parse query status for $query_id" >&2
+      return 1
+    fi
+
+    case "$status" in
+      "Complete")
+        return 0
+        ;;
+      "Failed"|"Cancelled"|"Timeout")
+        echo_info "  ⚠ Query $query_id failed with status: $status" >&2
+        return 1
+        ;;
+      "Running"|"Scheduled")
+        sleep $QUERY_POLL_INTERVAL
+        elapsed=$((elapsed + QUERY_POLL_INTERVAL))
+        ;;
+      *)
+        # Unknown status - log it for debugging
+        echo_info "  ⚠ Unexpected query status for $query_id: '$status'" >&2
+        return 1
+        ;;
+    esac
+  done
+
+  echo_info "  ⚠ Query $query_id timed out after ${QUERY_MAX_WAIT_TIME}s" >&2
+  return 1
+}
+
 
 # Determine time range
 if [ "$USE_TEST_RESULT" = true ]; then
-  # Use latest Artillery test result
-  LATEST_RESULT=$(ls -t results/*.json 2>/dev/null | head -1)
+  # Use latest Artillery test result (exclude .metrics.json files)
+  LATEST_RESULT=$(ls -t results/*.json 2>/dev/null | grep -v '\.metrics\.json' | head -1)
 
   if [ -z "$LATEST_RESULT" ]; then
     echo "Error: No Artillery test results found in results/ directory"
@@ -79,18 +132,22 @@ if [ "$USE_TEST_RESULT" = true ]; then
     echo "Error: Could not extract timestamps from $LATEST_RESULT"
     exit 1
   fi
-
-  START_TIME_HUMAN=$(timestamp_to_date $((START_TIMESTAMP / 1000)))
-  END_TIME_HUMAN=$(timestamp_to_date $((END_TIMESTAMP / 1000)))
-
+
+  # Convert milliseconds to seconds for AWS CLI
+  START_TIMESTAMP=$((START_TIMESTAMP / 1000))
+  END_TIMESTAMP=$((END_TIMESTAMP / 1000))
+
+  START_TIME_HUMAN=$(timestamp_to_date $START_TIMESTAMP)
+  END_TIME_HUMAN=$(timestamp_to_date $END_TIMESTAMP)
+
   echo_info "Test window: $START_TIME_HUMAN ~ $END_TIME_HUMAN"
   echo_info ""
 else
   # Use traditional time range (last N minutes)
   START_TIME="${1:-15}"
   echo_info "Analyzing last ${START_TIME} minutes of logs..."
   echo_info ""
-  
+
   # Calculate timestamps (seconds, for AWS CLI)
   END_TIMESTAMP=$(date +%s)
   START_TIMESTAMP=$((END_TIMESTAMP - (START_TIME * 60)))
@@ -99,6 +156,26 @@ fi
 # Add 5-minute buffer to end time to account for log ingestion delay
 END_TIMESTAMP_BUFFERED=$((END_TIMESTAMP + 300))
 
+# Calculate optimal period for CloudWatch metrics to avoid exceeding 1440 datapoints limit
+# Formula: period = max(60, ceil(duration / 1440) * 60) rounded up to nearest minute
+TEST_DURATION=$((END_TIMESTAMP - START_TIMESTAMP))
+CLOUDWATCH_PERIOD=60
+if [ $TEST_DURATION -gt 86400 ]; then
+  # For duration > 1 day, use 5-minute periods
+  CLOUDWATCH_PERIOD=300
+elif [ $TEST_DURATION -gt 43200 ]; then
+  # For duration > 12 hours, use 3-minute periods
+  CLOUDWATCH_PERIOD=180
+elif [ $TEST_DURATION -gt 14400 ]; then
+  # For duration > 4 hours, use 2-minute periods
+  CLOUDWATCH_PERIOD=120
+else
+  # For duration <= 4 hours, use 1-minute periods (gives max 240 datapoints for 4 hours)
+  CLOUDWATCH_PERIOD=60
+fi
+
+echo_info "CloudWatch period: ${CLOUDWATCH_PERIOD}s (test duration: ${TEST_DURATION}s)"
+
 # Initialize JSON output structure
 if [ "$OUTPUT_JSON" = true ]; then
   JSON_OUTPUT="{\"timestamp\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\",\"environment\":\"$ENVIRONMENT\",\"timeRange\":{\"start\":$((START_TIMESTAMP * 1000)),\"end\":$((END_TIMESTAMP * 1000))},\"metrics\":{}}"
@@ -139,8 +216,9 @@ fields @timestamp, totalE2eMs, workerDurationMs, queueWaitMs, syncResponseMs, as
 if [ -f /tmp/e2e-query.json ]; then
   E2E_QUERY_ID=$(cat /tmp/e2e-query.json | jq -r '.queryId' 2>/dev/null)
   if [ "$E2E_QUERY_ID" != "null" ] && [ -n "$E2E_QUERY_ID" ]; then
-    sleep 8
-    aws logs get-query-results --query-id ${E2E_QUERY_ID} --region ${REGION} --output table 2>/dev/null || echo "  ⚠ Query failed"
+    if wait_for_query_completion "$E2E_QUERY_ID" "$REGION"; then
+      aws logs get-query-results --query-id ${E2E_QUERY_ID} --region ${REGION} --output table 2>/dev/null || echo "  ⚠ Query failed"
+    fi
   fi
 fi
 
@@ -185,21 +263,22 @@ fields @timestamp, @duration, @billedDuration, @memorySize, @maxMemoryUsed
 
 ROUTER_QUERY_ID=$(cat /tmp/router-query.json | jq -r '.queryId')
 
-# Wait for queries to complete
-sleep 5
-
 echo "Total Invocations (START events):"
-aws logs get-query-results \
-  --query-id ${ROUTER_START_QUERY_ID} \
-  --region ${REGION} \
-  --output table
+if wait_for_query_completion "$ROUTER_START_QUERY_ID" "$REGION"; then
+  aws logs get-query-results \
+    --query-id ${ROUTER_START_QUERY_ID} \
+    --region ${REGION} \
+    --output table
+fi
 
 echo ""
 echo "Completed Invocations (REPORT events with performance data):"
-aws logs get-query-results \
-  --query-id ${ROUTER_QUERY_ID} \
-  --region ${REGION} \
-  --output table
+if wait_for_query_completion "$ROUTER_QUERY_ID" "$REGION"; then
+  aws logs get-query-results \
+    --query-id ${ROUTER_QUERY_ID} \
+    --region ${REGION} \
+    --output table
+fi
 
 echo ""
 
@@ -242,20 +321,22 @@ fields @timestamp, @duration, @billedDuration, @memorySize, @maxMemoryUsed
 
 WORKER_QUERY_ID=$(cat /tmp/worker-query.json | jq -r '.queryId')
 
-sleep 5
-
 echo "Total Invocations (START events):"
-aws logs get-query-results \
-  --query-id ${WORKER_START_QUERY_ID} \
-  --region ${REGION} \
-  --output table
+if wait_for_query_completion "$WORKER_START_QUERY_ID" "$REGION"; then
+  aws logs get-query-results \
+    --query-id ${WORKER_START_QUERY_ID} \
+    --region ${REGION} \
+    --output table
+fi
 
 echo ""
 echo "Completed Invocations (REPORT events with performance data):"
-aws logs get-query-results \
-  --query-id ${WORKER_QUERY_ID} \
-  --region ${REGION} \
-  --output table
+if wait_for_query_completion "$WORKER_QUERY_ID" "$REGION"; then
+  aws logs get-query-results \
+    --query-id ${WORKER_QUERY_ID} \
+    --region ${REGION} \
+    --output table
+fi
 
 echo ""
 
@@ -276,8 +357,9 @@ fields @timestamp, @message
 ' > /tmp/router-errors.json
 
 ROUTER_ERROR_ID=$(cat /tmp/router-errors.json | jq -r '.queryId')
-sleep 5
-aws logs get-query-results --query-id ${ROUTER_ERROR_ID} --region ${REGION} --output table
+if wait_for_query_completion "$ROUTER_ERROR_ID" "$REGION"; then
+  aws logs get-query-results --query-id ${ROUTER_ERROR_ID} --region ${REGION} --output table
+fi
 
 echo ""
 echo "Worker Errors:"
@@ -294,8 +376,9 @@ fields @timestamp, @message
 ' > /tmp/worker-errors.json
 
 WORKER_ERROR_ID=$(cat /tmp/worker-errors.json | jq -r '.queryId')
-sleep 5
-aws logs get-query-results --query-id ${WORKER_ERROR_ID} --region ${REGION} --output table
+if wait_for_query_completion "$WORKER_ERROR_ID" "$REGION"; then
+  aws logs get-query-results --query-id ${WORKER_ERROR_ID} --region ${REGION} --output table
+fi
 
 echo ""
 
@@ -318,8 +401,9 @@ fields @timestamp, @initDuration
 ' > /tmp/router-cold.json
 
 ROUTER_COLD_ID=$(cat /tmp/router-cold.json | jq -r '.queryId')
-sleep 5
-aws logs get-query-results --query-id ${ROUTER_COLD_ID} --region ${REGION} --output table
+if wait_for_query_completion "$ROUTER_COLD_ID" "$REGION"; then
+  aws logs get-query-results --query-id ${ROUTER_COLD_ID} --region ${REGION} --output table
+fi
 
 echo ""
 echo "Worker Cold Starts:"
@@ -338,8 +422,9 @@ fields @timestamp, @initDuration
 ' > /tmp/worker-cold.json
 
 WORKER_COLD_ID=$(cat /tmp/worker-cold.json | jq -r '.queryId')
-sleep 5
-aws logs get-query-results --query-id ${WORKER_COLD_ID} --region ${REGION} --output table
+if wait_for_query_completion "$WORKER_COLD_ID" "$REGION"; then
+  aws logs get-query-results --query-id ${WORKER_COLD_ID} --region ${REGION} --output table
+fi
 
 echo_info ""
 echo_info "========================================"
@@ -363,10 +448,10 @@ aws cloudwatch get-metric-statistics \
   --dimensions Name=FunctionName,Value=laco-${ENVIRONMENT}-slack-router \
   --start-time $(date -u -d @$START_TIMESTAMP +%Y-%m-%dT%H:%M:%S 2>/dev/null || date -u -r $START_TIMESTAMP +%Y-%m-%dT%H:%M:%S) \
   --end-time $(date -u +%Y-%m-%dT%H:%M:%S) \
-  --period 60 \
-  --statistics Maximum,Average \
+  --period ${CLOUDWATCH_PERIOD} \
+  --statistics Maximum Average \
   --region ${REGION} \
-  --output table
+  --output table 2>/dev/null || echo "  (No data)"
 
 echo_info ""
 echo_info "Echo Worker Lambda:"
@@ -376,10 +461,10 @@ aws cloudwatch get-metric-statistics \
   --dimensions Name=FunctionName,Value=laco-${ENVIRONMENT}-chatbot-echo-worker \
   --start-time $(date -u -d @$START_TIMESTAMP +%Y-%m-%dT%H:%M:%S 2>/dev/null || date -u -r $START_TIMESTAMP +%Y-%m-%dT%H:%M:%S) \
   --end-time $(date -u +%Y-%m-%dT%H:%M:%S) \
-  --period 60 \
-  --statistics Maximum,Average \
+  --period ${CLOUDWATCH_PERIOD} \
+  --statistics Maximum Average \
   --region ${REGION} \
-  --output table
+  --output table 2>/dev/null || echo "  (No data)"
 
 echo_info ""
 echo_info "6. Throttles"
@@ -392,10 +477,10 @@ aws cloudwatch get-metric-statistics \
   --dimensions Name=FunctionName,Value=laco-${ENVIRONMENT}-slack-router \
   --start-time $(date -u -d @$START_TIMESTAMP +%Y-%m-%dT%H:%M:%S 2>/dev/null || date -u -r $START_TIMESTAMP +%Y-%m-%dT%H:%M:%S) \
   --end-time $(date -u +%Y-%m-%dT%H:%M:%S) \
-  --period 60 \
+  --period ${CLOUDWATCH_PERIOD} \
   --statistics Sum \
   --region ${REGION} \
-  --output table
+  --output table 2>/dev/null || echo "  (No data)"
 
 echo_info ""
 echo_info "Worker Throttles:"
@@ -405,10 +490,10 @@ aws cloudwatch get-metric-statistics \
   --dimensions Name=FunctionName,Value=laco-${ENVIRONMENT}-chatbot-echo-worker \
   --start-time $(date -u -d @$START_TIMESTAMP +%Y-%m-%dT%H:%M:%S 2>/dev/null || date -u -r $START_TIMESTAMP +%Y-%m-%dT%H:%M:%S) \
   --end-time $(date -u +%Y-%m-%dT%H:%M:%S) \
-  --period 60 \
+  --period ${CLOUDWATCH_PERIOD} \
   --statistics Sum \
   --region ${REGION} \
-  --output table
+  --output table 2>/dev/null || echo "  (No data)"
 
 echo_info ""
 echo_info "========================================"
@@ -426,10 +511,10 @@ aws cloudwatch get-metric-statistics \
   --dimensions Name=QueueName,Value=laco-${ENVIRONMENT}-chatbot-echo \
   --start-time $(date -u -d @$START_TIMESTAMP +%Y-%m-%dT%H:%M:%S 2>/dev/null || date -u -r $START_TIMESTAMP +%Y-%m-%dT%H:%M:%S) \
   --end-time $(date -u +%Y-%m-%dT%H:%M:%S) \
-  --period 60 \
-  --statistics Average,Maximum \
+  --period ${CLOUDWATCH_PERIOD} \
+  --statistics Average Maximum \
   --region ${REGION} \
-  --output table
+  --output table 2>/dev/null || echo "  (No data)"
 
 echo_info ""
 echo_info "8. SQS Queue Depth"
@@ -441,10 +526,10 @@ aws cloudwatch get-metric-statistics \
   --dimensions Name=QueueName,Value=laco-${ENVIRONMENT}-chatbot-echo \
   --start-time $(date -u -d @$START_TIMESTAMP +%Y-%m-%dT%H:%M:%S 2>/dev/null || date -u -r $START_TIMESTAMP +%Y-%m-%dT%H:%M:%S) \
   --end-time $(date -u +%Y-%m-%dT%H:%M:%S) \
-  --period 60 \
-  --statistics Average,Maximum \
+  --period ${CLOUDWATCH_PERIOD} \
+  --statistics Average Maximum \
   --region ${REGION} \
-  --output table
+  --output table 2>/dev/null || echo "  (No data)"
 
 fi  # End of quiet mode skip