From 1b231b9a7588ff4b83b6298cff023b677a363602 Mon Sep 17 00:00:00 2001 From: Peter Rival Date: Thu, 11 Dec 2025 17:33:50 -0500 Subject: [PATCH] Add GitHub Actions CI pipeline for multi-cloud and bare metal testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds comprehensive CI/CD infrastructure to test Zathras across AWS, Azure, GCP, and bare metal environments. Features: - OIDC authentication for secure, keyless cloud access - Automated testing on PRs and manual workflow dispatch - Standard test suite (linpack, streams, fio) for each environment - Cost-optimized with spot instances and automatic resource cleanup - PR result comments and artifact uploads - Self-hosted runner support for bare metal testing Components: - 4 cloud-specific workflows (test-aws, test-azure, test-gcp, test-baremetal) - 2 reusable composite actions (setup-zathras, run-zathras-test) - Test scenarios for each environment - Comprehensive setup documentation Total test runtime: ~15-20 minutes per cloud Estimated cost: ~$0.10-0.20 per cloud run See ci/CI_SETUP.md for complete setup instructions. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .github/actions/run-zathras-test/action.yml | 135 +++++ .github/actions/setup-zathras/action.yml | 80 +++ .github/workflows/test-aws.yml | 132 +++++ .github/workflows/test-azure.yml | 123 +++++ .github/workflows/test-baremetal.yml | 218 ++++++++ .github/workflows/test-gcp.yml | 141 ++++++ ci/CI_SETUP.md | 520 ++++++++++++++++++++ ci/README.md | 98 +++- ci/test_scenarios/README.md | 45 ++ ci/test_scenarios/aws_ci_test.yml | 33 ++ ci/test_scenarios/azure_ci_test.yml | 33 ++ ci/test_scenarios/baremetal_ci_test.yml | 33 ++ ci/test_scenarios/gcp_ci_test.yml | 31 ++ 13 files changed, 1609 insertions(+), 13 deletions(-) create mode 100644 .github/actions/run-zathras-test/action.yml create mode 100644 .github/actions/setup-zathras/action.yml create mode 100644 .github/workflows/test-aws.yml create mode 100644 .github/workflows/test-azure.yml create mode 100644 .github/workflows/test-baremetal.yml create mode 100644 .github/workflows/test-gcp.yml create mode 100644 ci/CI_SETUP.md create mode 100644 ci/test_scenarios/README.md create mode 100644 ci/test_scenarios/aws_ci_test.yml create mode 100644 ci/test_scenarios/azure_ci_test.yml create mode 100644 ci/test_scenarios/baremetal_ci_test.yml create mode 100644 ci/test_scenarios/gcp_ci_test.yml diff --git a/.github/actions/run-zathras-test/action.yml b/.github/actions/run-zathras-test/action.yml new file mode 100644 index 00000000..64a4cb55 --- /dev/null +++ b/.github/actions/run-zathras-test/action.yml @@ -0,0 +1,135 @@ +name: 'Run Zathras Test' +description: 'Execute Zathras benchmark test and validate results' + +inputs: + scenario-file: + description: 'Path to the test scenario file' + required: true + cloud-os-id: + description: 'Cloud OS ID (AMI, image, etc.)' + required: false + additional-args: + description: 'Additional arguments to pass to burden' + required: false + default: '' + +outputs: + test-status: + description: 'Test 
execution status (success/failure)' + value: ${{ steps.validate.outputs.status }} + results-path: + description: 'Path to test results directory' + value: ${{ steps.run.outputs.results_path }} + +runs: + using: "composite" + steps: + - name: Prepare scenario file + id: prepare + shell: bash + env: + SCENARIO_FILE: ${{ inputs.scenario-file }} + CLOUD_OS_ID: ${{ inputs.cloud-os-id }} + run: | + # Copy scenario file to temporary location and inject runtime values + SCENARIO_TEMP="/tmp/zathras_ci_scenario_${{ github.run_id }}.yml" + cp "$SCENARIO_FILE" "$SCENARIO_TEMP" + + # Inject cloud_os_id if provided + if [ -n "$CLOUD_OS_ID" ]; then + # Use yq to add cloud_os_id to global section + python3 -c " + import yaml + import sys + with open('$SCENARIO_TEMP', 'r') as f: + data = yaml.safe_load(f) + if 'global' not in data: + data['global'] = {} + data['global']['cloud_os_id'] = '$CLOUD_OS_ID' + data['global']['ssh_key_file'] = '$ZATHRAS_SSH_KEY' + with open('$SCENARIO_TEMP', 'w') as f: + yaml.dump(data, f, default_flow_style=False) + " + fi + + echo "scenario_path=$SCENARIO_TEMP" >> $GITHUB_OUTPUT + echo "Using scenario file: $SCENARIO_TEMP" + cat "$SCENARIO_TEMP" + + - name: Run Zathras test + id: run + shell: bash + env: + SCENARIO_PATH: ${{ steps.prepare.outputs.scenario_path }} + ADDITIONAL_ARGS: ${{ inputs.additional-args }} + run: | + # Execute burden with scenario file + echo "Starting Zathras test..." + echo "Scenario: $SCENARIO_PATH" + echo "Additional args: $ADDITIONAL_ARGS" + + # Run burden and capture output + set +e # Don't exit on error, we want to capture the exit code + ./bin/burden --scenario "$SCENARIO_PATH" $ADDITIONAL_ARGS + BURDEN_EXIT_CODE=$? + set -e + + echo "burden_exit_code=$BURDEN_EXIT_CODE" >> $GITHUB_OUTPUT + + # Find results directory (burden creates timestamped directories) + RESULTS_DIR=$(find . 
-maxdepth 1 -type d -name "zathras_*" -printf '%T@ %p\n' | sort -n | tail -1 | cut -d' ' -f2-) + if [ -z "$RESULTS_DIR" ]; then + RESULTS_DIR="./results" + fi + echo "results_path=$RESULTS_DIR" >> $GITHUB_OUTPUT + + # Do not exit non-zero here: the "Validate test results" step reads burden_exit_code, records status=failure and fails the action + + - name: Validate test results + id: validate + shell: bash + env: + RESULTS_PATH: ${{ steps.run.outputs.results_path }} + BURDEN_EXIT_CODE: ${{ steps.run.outputs.burden_exit_code }} + run: | + echo "Validating test results..." + echo "Results directory: $RESULTS_PATH" + echo "Burden exit code: $BURDEN_EXIT_CODE" + + # Check if burden succeeded + if [ "$BURDEN_EXIT_CODE" -ne 0 ]; then + echo "❌ Burden failed with exit code: $BURDEN_EXIT_CODE" + echo "status=failure" >> $GITHUB_OUTPUT + exit 1 + fi + + # Check if results directory exists + if [ ! -d "$RESULTS_PATH" ]; then + echo "❌ Results directory not found: $RESULTS_PATH" + echo "status=failure" >> $GITHUB_OUTPUT + exit 1 + fi + + # Check if any result files were generated (parentheses make -type f apply to every -name alternative) + RESULT_FILES=$(find "$RESULTS_PATH" -type f \( -name "*.json" -o -name "*.csv" -o -name "*.txt" \) | wc -l) + if [ "$RESULT_FILES" -eq 0 ]; then + echo "⚠️ Warning: No result files found in $RESULTS_PATH" + else + echo "✅ Found $RESULT_FILES result files" + fi + + # List results + echo "Results directory contents:" + ls -lah "$RESULTS_PATH" || true + + echo "✅ Test completed successfully" + echo "status=success" >> $GITHUB_OUTPUT + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: zathras-results-${{ github.run_id }}-${{ strategy.job-index || '0' }} + path: ${{ steps.run.outputs.results_path }} + retention-days: 30 + if-no-files-found: warn diff --git a/.github/actions/setup-zathras/action.yml b/.github/actions/setup-zathras/action.yml new file mode 100644 index 00000000..047ac4f2 --- /dev/null +++ b/.github/actions/setup-zathras/action.yml @@ -0,0 +1,80 @@ +name: 'Setup Zathras' +description: 'Install Zathras dependencies and prepare for testing' + +inputs: + 
ssh-private-key: + description: 'SSH private key for accessing cloud instances' + required: true + +outputs: + zathras-version: + description: 'Zathras git commit SHA' + value: ${{ steps.version.outputs.sha }} + +runs: + using: "composite" + steps: + # Callers must run actions/checkout before this action (a local action can only be loaded from a checked-out workspace); + # checking out again here would git-clean credential files written into the workspace by earlier auth steps. + - name: Get Zathras version + id: version + shell: bash + run: echo "sha=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT + + - name: Install system dependencies + shell: bash + run: | + # Install required system packages (dnf on RHEL-family runners, apt-get on GitHub-hosted Ubuntu runners) + if command -v dnf >/dev/null 2>&1; then PKG_MGR=dnf; else PKG_MGR=apt-get; sudo apt-get update; fi + sudo "$PKG_MGR" install -y \ + ansible-core \ + git \ + jq \ + python3 \ + python3-pip \ + unzip \ + wget + + - name: Install Terraform + shell: bash + run: | + # Install Terraform 1.9.8 + TERRAFORM_VERSION="1.9.8" + wget -q https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraform_${TERRAFORM_VERSION}_linux_amd64.zip + unzip -q terraform_${TERRAFORM_VERSION}_linux_amd64.zip + sudo mv terraform /usr/local/bin/ + terraform --version + + - name: Install Python dependencies + shell: bash + run: | + # Install required Python packages + pip install --user boto boto3 yq==2.10.0 + + - name: Install Ansible collections + shell: bash + run: | + # Install required Ansible collections + ansible-galaxy collection install amazon.aws:9.1.0 + ansible-galaxy collection install ansible.posix + ansible-galaxy collection install community.aws + ansible-galaxy collection install community.general + + - name: Set up SSH key + shell: bash + env: + SSH_PRIVATE_KEY: ${{ inputs.ssh-private-key }} + run: | + # Configure SSH key for cloud instance access + mkdir -p ~/.ssh + echo "$SSH_PRIVATE_KEY" > ~/.ssh/zathras_ci_key + chmod 600 ~/.ssh/zathras_ci_key + echo "ZATHRAS_SSH_KEY=$HOME/.ssh/zathras_ci_key" >> $GITHUB_ENV + + - name: Verify Zathras installation + shell: bash + run: | + # Verify bin/burden exists and is executable + ls -la bin/burden + file bin/burden + head -n 5 bin/burden diff --git a/.github/workflows/test-aws.yml b/.github/workflows/test-aws.yml new 
file mode 100644 index 00000000..8e08ae87 --- /dev/null +++ b/.github/workflows/test-aws.yml @@ -0,0 +1,136 @@ +name: Test AWS + +on: + # Manual trigger + workflow_dispatch: + inputs: + aws_region: + description: 'AWS region for testing' + required: false + default: 'us-east-1' + instance_type: + description: 'EC2 instance type (leave empty for default m5.xlarge)' + required: false + ami_id: + description: 'AMI ID (leave empty for default RHEL 9)' + required: false + + # Automatic trigger on PR to main + pull_request: + branches: + - main + paths: + - 'bin/**' + - 'ansible_roles/**' + - 'config/**' + - 'ci/test_scenarios/aws_ci_test.yml' + - '.github/workflows/test-aws.yml' + - '.github/actions/**' + +# Permissions required for OIDC authentication to AWS +permissions: + id-token: write # Required for requesting JWT + contents: read # Required for actions/checkout + pull-requests: write # Required for commenting on PRs + +env: + AWS_REGION: ${{ inputs.aws_region || 'us-east-1' }} + SCENARIO_FILE: ci/test_scenarios/aws_ci_test.yml + +jobs: + test-aws: + name: Test on AWS + runs-on: ubuntu-latest + timeout-minutes: 60 # Prevent runaway costs + + steps: + # Check out first: local actions (./.github/actions/*) can only be loaded from a checked-out workspace + - name: Check out repository + uses: actions/checkout@v4 + + - name: Configure AWS credentials via OIDC + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }} + aws-region: ${{ env.AWS_REGION }} + role-session-name: ZathrasCI-${{ github.run_id }} + + - name: Verify AWS authentication + run: | + echo "Verifying AWS credentials..." 
+ aws sts get-caller-identity + echo "AWS CLI configured successfully" + + - name: Set up Zathras + uses: ./.github/actions/setup-zathras + with: + ssh-private-key: ${{ secrets.AWS_SSH_PRIVATE_KEY }} + + - name: Determine AMI ID + id: ami + run: | + # Use provided AMI or find latest RHEL 9 AMI + if [ -n "${{ inputs.ami_id }}" ]; then + AMI_ID="${{ inputs.ami_id }}" + echo "Using provided AMI: $AMI_ID" + else + # Find latest RHEL 9 AMI in the region + AMI_ID=$(aws ec2 describe-images \ + --region ${{ env.AWS_REGION }} \ + --owners 309956199498 \ + --filters "Name=name,Values=RHEL-9.*_HVM-*-x86_64-*" \ + "Name=state,Values=available" \ + --query 'Images | sort_by(@, &CreationDate) | [-1].ImageId' \ + --output text) + echo "Found latest RHEL 9 AMI: $AMI_ID" + fi + echo "ami_id=$AMI_ID" >> $GITHUB_OUTPUT + + - name: Run Zathras test on AWS + id: test + uses: ./.github/actions/run-zathras-test + with: + scenario-file: ${{ env.SCENARIO_FILE }} + cloud-os-id: ${{ steps.ami.outputs.ami_id }} + additional-args: ${{ inputs.instance_type && format('--host_config {0}', inputs.instance_type) || '' }} + + - name: Check for orphaned resources + if: always() + run: | + echo "Checking for orphaned EC2 instances..." + aws ec2 describe-instances \ + --region ${{ env.AWS_REGION }} \ + --filters "Name=tag:ManagedBy,Values=Zathras" \ + "Name=instance-state-name,Values=running,pending,stopping,stopped" \ + --query 'Reservations[*].Instances[*].[InstanceId,State.Name,Tags[?Key==`Name`].Value|[0]]' \ + --output table || true + + - name: Post results to PR + if: github.event_name == 'pull_request' && always() + uses: actions/github-script@v7 + with: + script: | + const status = '${{ steps.test.outputs.test-status }}'; + const emoji = status === 'success' ? 
'✅' : '❌'; + const body = `## ${emoji} AWS Test Results + + **Status:** ${status} + **Region:** ${{ env.AWS_REGION }} + **AMI:** ${{ steps.ami.outputs.ami_id }} + **Workflow:** [Run #${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) + + ${status === 'success' ? 'All tests passed successfully!' : 'Tests failed. Check workflow logs for details.'} + `; + + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: body + }); + + - name: Fail workflow if tests failed + if: steps.test.outputs.test-status != 'success' + run: | + echo "❌ AWS tests failed" + exit 1 diff --git a/.github/workflows/test-azure.yml b/.github/workflows/test-azure.yml new file mode 100644 index 00000000..8b3f6ab5 --- /dev/null +++ b/.github/workflows/test-azure.yml @@ -0,0 +1,127 @@ +name: Test Azure + +on: + # Manual trigger + workflow_dispatch: + inputs: + azure_region: + description: 'Azure region for testing' + required: false + default: 'eastus' + vm_size: + description: 'Azure VM size (leave empty for default Standard_D4s_v3)' + required: false + image_id: + description: 'Azure image ID or URN (leave empty for default RHEL 9)' + required: false + + # Automatic trigger on PR to main + pull_request: + branches: + - main + paths: + - 'bin/**' + - 'ansible_roles/**' + - 'config/**' + - 'ci/test_scenarios/azure_ci_test.yml' + - '.github/workflows/test-azure.yml' + - '.github/actions/**' + +# Permissions required for OIDC authentication to Azure +permissions: + id-token: write # Required for requesting JWT + contents: read # Required for actions/checkout + pull-requests: write # Required for commenting on PRs + +env: + AZURE_REGION: ${{ inputs.azure_region || 'eastus' }} + SCENARIO_FILE: ci/test_scenarios/azure_ci_test.yml + +jobs: + test-azure: + name: Test on Azure + runs-on: ubuntu-latest + timeout-minutes: 60 # Prevent runaway costs + + steps: + # Check out first: local actions (./.github/actions/*) can only be loaded from a checked-out workspace + - name: Check out repository + uses: actions/checkout@v4 + + - name: 
Azure login via OIDC + uses: azure/login@v2 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + + - name: Verify Azure authentication + run: | + echo "Verifying Azure credentials..." + az account show + echo "Azure CLI configured successfully" + + - name: Set up Zathras + uses: ./.github/actions/setup-zathras + with: + ssh-private-key: ${{ secrets.AZURE_SSH_PRIVATE_KEY }} + + - name: Determine Azure image + id: image + run: | + # Use provided image or find latest RHEL 9 image + if [ -n "${{ inputs.image_id }}" ]; then + IMAGE_ID="${{ inputs.image_id }}" + echo "Using provided image: $IMAGE_ID" + else + # Use RHEL 9 URN (Publisher:Offer:Sku:Version) + IMAGE_ID="RedHat:RHEL:9-lvm-gen2:latest" + echo "Using default RHEL 9 image: $IMAGE_ID" + fi + echo "image_id=$IMAGE_ID" >> $GITHUB_OUTPUT + + - name: Run Zathras test on Azure + id: test + uses: ./.github/actions/run-zathras-test + with: + scenario-file: ${{ env.SCENARIO_FILE }} + cloud-os-id: ${{ steps.image.outputs.image_id }} + additional-args: ${{ inputs.vm_size && format('--host_config {0}', inputs.vm_size) || '' }} + + - name: Check for orphaned resources + if: always() + run: | + echo "Checking for orphaned resource groups..." + az group list \ + --query "[?tags.ManagedBy=='Zathras'].{Name:name, Location:location, State:properties.provisioningState}" \ + --output table || true + + - name: Post results to PR + if: github.event_name == 'pull_request' && always() + uses: actions/github-script@v7 + with: + script: | + const status = '${{ steps.test.outputs.test-status }}'; + const emoji = status === 'success' ? 
'✅' : '❌'; + const body = `## ${emoji} Azure Test Results + + **Status:** ${status} + **Region:** ${{ env.AZURE_REGION }} + **Image:** ${{ steps.image.outputs.image_id }} + **Workflow:** [Run #${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) + + ${status === 'success' ? 'All tests passed successfully!' : 'Tests failed. Check workflow logs for details.'} + `; + + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: body + }); + + - name: Fail workflow if tests failed + if: steps.test.outputs.test-status != 'success' + run: | + echo "❌ Azure tests failed" + exit 1 diff --git a/.github/workflows/test-baremetal.yml b/.github/workflows/test-baremetal.yml new file mode 100644 index 00000000..9964d0cc --- /dev/null +++ b/.github/workflows/test-baremetal.yml @@ -0,0 +1,218 @@ +name: Test Bare Metal + +on: + # Manual trigger + workflow_dispatch: + inputs: + target_host: + description: 'Target bare metal hostname (must have local config file)' + required: true + ssh_user: + description: 'SSH username (leave empty for default)' + required: false + + # Automatic trigger on PR to main (only if self-hosted runner is available) + # Disabled by default - uncomment when runner is configured + # pull_request: + # branches: + # - main + # paths: + # - 'bin/**' + # - 'ansible_roles/**' + # - 'config/**' + # - 'ci/test_scenarios/baremetal_ci_test.yml' + # - '.github/workflows/test-baremetal.yml' + # - '.github/actions/**' + +permissions: + contents: read + pull-requests: write # Required for commenting on PRs + +env: + SCENARIO_FILE: ci/test_scenarios/baremetal_ci_test.yml + TARGET_HOST: ${{ inputs.target_host }} + +jobs: + test-baremetal: + name: Test on Bare Metal + # IMPORTANT: This requires a self-hosted runner with label 'baremetal' + # See documentation for setup instructions + runs-on: self-hosted + # Alternatively, use specific labels 
for different bare metal systems: + # runs-on: [self-hosted, linux, baremetal] + timeout-minutes: 60 + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Verify runner has Zathras dependencies + run: | + echo "Checking for required dependencies..." + + # Check for required commands + for cmd in ansible-galaxy terraform python3 pip3 git jq; do + if ! command -v $cmd &> /dev/null; then + echo "❌ Required command not found: $cmd" + echo "Please install dependencies on the self-hosted runner" + exit 1 + fi + echo "✅ Found: $cmd ($(command -v $cmd))" + done + + # Check Python packages + echo "Checking Python packages..." + pip3 list | grep -E "(boto|boto3|yq)" || true + + # Check Ansible collections + echo "Checking Ansible collections..." + ansible-galaxy collection list | grep -E "(amazon.aws|ansible.posix|community)" || true + + - name: Verify local config exists + run: | + CONFIG_FILE="local_configs/${{ env.TARGET_HOST }}.config" + if [ ! -f "$CONFIG_FILE" ]; then + echo "❌ Local config file not found: $CONFIG_FILE" + echo "Available configs:" + ls -1 local_configs/ || echo "No local_configs directory found" + exit 1 + fi + echo "✅ Found local config: $CONFIG_FILE" + cat "$CONFIG_FILE" + + - name: Verify SSH access to target host + run: | + echo "Testing SSH connection to ${{ env.TARGET_HOST }}..." 
+ + # Use SSH key from secrets if provided, otherwise use runner's default + if [ -n "${{ secrets.BAREMETAL_SSH_PRIVATE_KEY }}" ]; then + mkdir -p ~/.ssh + echo "${{ secrets.BAREMETAL_SSH_PRIVATE_KEY }}" > ~/.ssh/zathras_baremetal_key + chmod 600 ~/.ssh/zathras_baremetal_key + SSH_KEY_ARG="-i ~/.ssh/zathras_baremetal_key" + else + SSH_KEY_ARG="" + fi + + # Determine SSH user + SSH_USER="${{ inputs.ssh_user }}" + if [ -z "$SSH_USER" ]; then + SSH_USER=$(whoami) + fi + + # Test SSH connection + if ssh $SSH_KEY_ARG -o BatchMode=yes -o ConnectTimeout=10 \ + ${SSH_USER}@${{ env.TARGET_HOST }} "echo 'SSH connection successful'"; then + echo "✅ SSH connection successful" + else + echo "❌ SSH connection failed" + echo "Ensure the runner has SSH access to ${{ env.TARGET_HOST }}" + exit 1 + fi + + - name: Prepare scenario file for bare metal + id: prepare + run: | + # Copy scenario and inject target hostname (SCENARIO_TEMP is exported so the Python snippet below can read it) + export SCENARIO_TEMP="/tmp/zathras_baremetal_${{ github.run_id }}.yml" + cp "${{ env.SCENARIO_FILE }}" "$SCENARIO_TEMP" + + # Inject target hostname into scenario; the heredoc is quoted (no shell expansion), so values are read from the environment + python3 << 'EOF' + import os + import yaml + scenario_file = os.environ["SCENARIO_TEMP"] + target_host = os.environ["TARGET_HOST"] + + with open(scenario_file, 'r') as f: + data = yaml.safe_load(f) + + # Set host_config to target hostname + if 'systems' in data and 'ci_test_system' in data['systems']: + data['systems']['ci_test_system']['host_config'] = target_host + + with open(scenario_file, 'w') as f: + yaml.dump(data, f, default_flow_style=False) + print(f"Updated scenario file with host: {target_host}") + EOF + + echo "scenario_path=$SCENARIO_TEMP" >> $GITHUB_OUTPUT + echo "Scenario file:" + cat "$SCENARIO_TEMP" + + - name: Run Zathras test on bare metal + id: test + run: | + echo "Starting Zathras test on bare metal..." 
+ echo "Target host: ${{ env.TARGET_HOST }}" + echo "Scenario: ${{ steps.prepare.outputs.scenario_path }}" + + # Run burden and capture output + set +e + ./bin/burden --scenario "${{ steps.prepare.outputs.scenario_path }}" + BURDEN_EXIT_CODE=$? + set -e + + # Find results directory + RESULTS_DIR=$(find . -maxdepth 1 -type d -name "zathras_*" -printf '%T@ %p\n' | sort -n | tail -1 | cut -d' ' -f2-) + if [ -z "$RESULTS_DIR" ]; then + RESULTS_DIR="./results" + fi + echo "results_path=$RESULTS_DIR" >> $GITHUB_OUTPUT + + # Validate results + if [ "$BURDEN_EXIT_CODE" -ne 0 ]; then + echo "❌ Burden failed with exit code: $BURDEN_EXIT_CODE" + echo "test_status=failure" >> $GITHUB_OUTPUT + exit 1 + fi + + if [ -d "$RESULTS_DIR" ]; then + echo "✅ Test completed successfully" + echo "Results:" + ls -lah "$RESULTS_DIR" + echo "test_status=success" >> $GITHUB_OUTPUT + else + echo "❌ Results directory not found" + echo "test_status=failure" >> $GITHUB_OUTPUT + exit 1 + fi + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: zathras-baremetal-results-${{ github.run_id }} + path: ${{ steps.test.outputs.results_path }} + retention-days: 30 + if-no-files-found: warn + + - name: Post results to PR + if: github.event_name == 'pull_request' && always() + uses: actions/github-script@v7 + with: + script: | + const status = '${{ steps.test.outputs.test_status }}'; + const emoji = status === 'success' ? '✅' : '❌'; + const body = `## ${emoji} Bare Metal Test Results + + **Status:** ${status} + **Target Host:** ${{ env.TARGET_HOST }} + **Runner:** ${process.env.RUNNER_NAME || 'self-hosted'} + **Workflow:** [Run #${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) + + ${status === 'success' ? 'All tests passed successfully!' : 'Tests failed. 
Check workflow logs for details.'} + `; + + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: body + }); + + - name: Fail workflow if tests failed + if: steps.test.outputs.test_status != 'success' + run: | + echo "❌ Bare metal tests failed" + exit 1 diff --git a/.github/workflows/test-gcp.yml b/.github/workflows/test-gcp.yml new file mode 100644 index 00000000..e7d556e6 --- /dev/null +++ b/.github/workflows/test-gcp.yml @@ -0,0 +1,145 @@ +name: Test GCP + +on: + # Manual trigger + workflow_dispatch: + inputs: + gcp_region: + description: 'GCP region for testing' + required: false + default: 'us-central1' + gcp_zone: + description: 'GCP zone for testing' + required: false + default: 'us-central1-a' + machine_type: + description: 'GCP machine type (leave empty for default n2-standard-4)' + required: false + image_family: + description: 'GCP image family (leave empty for default RHEL 9)' + required: false + + # Automatic trigger on PR to main + pull_request: + branches: + - main + paths: + - 'bin/**' + - 'ansible_roles/**' + - 'config/**' + - 'ci/test_scenarios/gcp_ci_test.yml' + - '.github/workflows/test-gcp.yml' + - '.github/actions/**' + +# Permissions required for OIDC authentication to GCP +permissions: + id-token: write # Required for requesting JWT + contents: read # Required for actions/checkout + pull-requests: write # Required for commenting on PRs + +env: + GCP_REGION: ${{ inputs.gcp_region || 'us-central1' }} + GCP_ZONE: ${{ inputs.gcp_zone || 'us-central1-a' }} + SCENARIO_FILE: ci/test_scenarios/gcp_ci_test.yml + +jobs: + test-gcp: + name: Test on GCP + runs-on: ubuntu-latest + timeout-minutes: 60 # Prevent runaway costs + + steps: + # Check out first: local actions (./.github/actions/*) can only be loaded from a checked-out workspace + - name: Check out repository + uses: actions/checkout@v4 + + # Run setup before auth: google-github-actions/auth writes its credentials file into the workspace, so any checkout performed after auth would delete it + - name: Set up Zathras + uses: ./.github/actions/setup-zathras + with: + ssh-private-key: ${{ secrets.GCP_SSH_PRIVATE_KEY }} + + - name: Authenticate to Google Cloud via OIDC + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + + - name: Set up 
Cloud SDK + uses: google-github-actions/setup-gcloud@v2 + + - name: Verify GCP authentication + run: | + echo "Verifying GCP credentials..." + gcloud auth list + gcloud config list + echo "GCP CLI configured successfully" + + - name: Set GCP project + run: | + # Extract project from service account or use secret + if [ -n "${{ secrets.GCP_PROJECT_ID }}" ]; then + gcloud config set project ${{ secrets.GCP_PROJECT_ID }} + fi + echo "Active project: $(gcloud config get-value project)" + + - name: Determine GCP image + id: image + run: | + # Use provided image family or default to RHEL 9 + if [ -n "${{ inputs.image_family }}" ]; then + IMAGE_FAMILY="${{ inputs.image_family }}" + else + IMAGE_FAMILY="rhel-9" + fi + + # Construct full image path + IMAGE_ID="projects/rhel-cloud/global/images/family/${IMAGE_FAMILY}" + echo "Using image: $IMAGE_ID" + echo "image_id=$IMAGE_ID" >> $GITHUB_OUTPUT + + - name: Run Zathras test on GCP + id: test + uses: ./.github/actions/run-zathras-test + with: + scenario-file: ${{ env.SCENARIO_FILE }} + cloud-os-id: ${{ steps.image.outputs.image_id }} + additional-args: ${{ inputs.machine_type && format('--host_config {0}', inputs.machine_type) || '' }} + + - name: Check for orphaned resources + if: always() + run: | + echo "Checking for orphaned GCE instances..." + gcloud compute instances list \ + --filter="labels.managed_by=zathras" \ + --format="table(name,zone,status,creationTimestamp)" || true + + - name: Post results to PR + if: github.event_name == 'pull_request' && always() + uses: actions/github-script@v7 + with: + script: | + const status = '${{ steps.test.outputs.test-status }}'; + const emoji = status === 'success' ? 
'✅' : '❌'; + const body = `## ${emoji} GCP Test Results + + **Status:** ${status} + **Region/Zone:** ${{ env.GCP_REGION }}/${{ env.GCP_ZONE }} + **Image:** ${{ steps.image.outputs.image_id }} + **Workflow:** [Run #${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) + + ${status === 'success' ? 'All tests passed successfully!' : 'Tests failed. Check workflow logs for details.'} + `; + + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: body + }); + + - name: Fail workflow if tests failed + if: steps.test.outputs.test-status != 'success' + run: | + echo "❌ GCP tests failed" + exit 1 diff --git a/ci/CI_SETUP.md b/ci/CI_SETUP.md new file mode 100644 index 00000000..bd70e0aa --- /dev/null +++ b/ci/CI_SETUP.md @@ -0,0 +1,520 @@ +# Zathras CI/CD Setup Guide + +This guide explains how to configure GitHub Actions CI/CD for testing Zathras across AWS, Azure, GCP, and bare metal systems. 
+ +## Table of Contents + +- [Overview](#overview) +- [Prerequisites](#prerequisites) +- [AWS Setup](#aws-setup) +- [Azure Setup](#azure-setup) +- [GCP Setup](#gcp-setup) +- [Bare Metal Setup](#bare-metal-setup) +- [GitHub Secrets Configuration](#github-secrets-configuration) +- [Testing the Workflows](#testing-the-workflows) +- [Troubleshooting](#troubleshooting) + +## Overview + +The Zathras CI pipeline consists of: + +- **4 cloud-specific workflows**: AWS, Azure, GCP, and bare metal +- **Automated testing**: Triggered on PRs to main branch +- **Manual testing**: Can be triggered via GitHub Actions UI +- **OIDC authentication**: Secure, keyless authentication to cloud providers +- **Standard test suite**: Runs linpack, streams, and fio benchmarks (~15-20 min per cloud) + +## Prerequisites + +- GitHub repository with Actions enabled +- Admin access to configure secrets +- Cloud provider accounts (AWS, Azure, GCP) +- (Optional) Self-hosted runner for bare metal testing + +## AWS Setup + +### 1. Create an IAM Role for GitHub OIDC + +Create an IAM role that GitHub Actions can assume using OIDC: + +```bash +# 1. Create the OIDC identity provider (one-time setup) +aws iam create-open-id-connect-provider \ + --url https://token.actions.githubusercontent.com \ + --client-id-list sts.amazonaws.com \ + --thumbprint-list 6938fd4d98bab03faadb97b34396831e3780aea1 + +# 2. 
Create a trust policy file +cat > trust-policy.json << 'EOF' +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Federated": "arn:aws:iam::ACCOUNT_ID:oidc-provider/token.actions.githubusercontent.com" + }, + "Action": "sts:AssumeRoleWithWebIdentity", + "Condition": { + "StringEquals": { + "token.actions.githubusercontent.com:aud": "sts.amazonaws.com" + }, + "StringLike": { + "token.actions.githubusercontent.com:sub": "repo:YOUR_ORG/zathras:*" + } + } + } + ] +} +EOF + +# Replace ACCOUNT_ID and YOUR_ORG with your values +sed -i "s/ACCOUNT_ID/$(aws sts get-caller-identity --query Account --output text)/" trust-policy.json +sed -i "s/YOUR_ORG/your-github-org/" trust-policy.json + +# 3. Create the IAM role +aws iam create-role \ + --role-name GitHubActionsZathrasRole \ + --assume-role-policy-document file://trust-policy.json + +# 4. Attach necessary permissions +# Create a policy with required EC2, VPC, and S3 permissions +cat > zathras-permissions.json << 'EOF' +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "ec2:RunInstances", + "ec2:TerminateInstances", + "ec2:DescribeInstances", + "ec2:DescribeImages", + "ec2:DescribeKeyPairs", + "ec2:DescribeSecurityGroups", + "ec2:DescribeSubnets", + "ec2:DescribeVpcs", + "ec2:CreateSecurityGroup", + "ec2:DeleteSecurityGroup", + "ec2:AuthorizeSecurityGroupIngress", + "ec2:AuthorizeSecurityGroupEgress", + "ec2:RevokeSecurityGroupIngress", + "ec2:RevokeSecurityGroupEgress", + "ec2:CreateTags", + "ec2:CreateKeyPair", + "ec2:DeleteKeyPair", + "ec2:DescribeVolumes", + "ec2:CreateVolume", + "ec2:DeleteVolume", + "ec2:AttachVolume", + "ec2:DetachVolume" + ], + "Resource": "*" + } + ] +} +EOF + +aws iam put-role-policy \ + --role-name GitHubActionsZathrasRole \ + --policy-name ZathrasTestPermissions \ + --policy-document file://zathras-permissions.json + +# 5. 
Get the role ARN (save this for GitHub secrets) +aws iam get-role --role-name GitHubActionsZathrasRole --query 'Role.Arn' --output text +``` + +### 2. Create SSH Key Pair + +```bash +# Generate SSH key for EC2 instances +ssh-keygen -t rsa -b 4096 -f ~/.ssh/zathras_ci_key -N "" + +# The private key content will be added to GitHub secrets +cat ~/.ssh/zathras_ci_key +``` + +### 3. Required GitHub Secrets for AWS + +Add these secrets in **Settings > Secrets and variables > Actions**: + +| Secret Name | Description | Example Value | +|-------------|-------------|---------------| +| `AWS_OIDC_ROLE_ARN` | ARN of the IAM role created above | `arn:aws:iam::123456789012:role/GitHubActionsZathrasRole` | +| `AWS_SSH_PRIVATE_KEY` | Private SSH key content | Content of `~/.ssh/zathras_ci_key` | + +## Azure Setup + +### 1. Create Azure Service Principal with OIDC + +```bash +# 1. Set variables +SUBSCRIPTION_ID=$(az account show --query id -o tsv) +RESOURCE_GROUP="zathras-ci-rg" # Optional: Create dedicated RG for CI +GITHUB_ORG="your-org" +GITHUB_REPO="zathras" + +# 2. Create an app registration +APP_ID=$(az ad app create \ + --display-name "GitHub Actions Zathras CI" \ + --query appId -o tsv) + +# 3. Create a service principal +az ad sp create --id $APP_ID + +# 4. Configure federated credentials for GitHub OIDC +az ad app federated-credential create \ + --id $APP_ID \ + --parameters '{ + "name": "GitHubActionsZathras", + "issuer": "https://token.actions.githubusercontent.com", + "subject": "repo:'"$GITHUB_ORG"'/'"$GITHUB_REPO"':ref:refs/heads/main", + "audiences": ["api://AzureADTokenExchange"] + }' + +# Also add credentials for PRs +az ad app federated-credential create \ + --id $APP_ID \ + --parameters '{ + "name": "GitHubActionsZathrasPR", + "issuer": "https://token.actions.githubusercontent.com", + "subject": "repo:'"$GITHUB_ORG"'/'"$GITHUB_REPO"':pull_request", + "audiences": ["api://AzureADTokenExchange"] + }' + +# 5. 
Assign Contributor role to the service principal +az role assignment create \ + --assignee $APP_ID \ + --role Contributor \ + --scope /subscriptions/$SUBSCRIPTION_ID + +# 6. Get the Tenant ID and Client ID (save for GitHub secrets) +echo "Client ID: $APP_ID" +echo "Tenant ID: $(az account show --query tenantId -o tsv)" +echo "Subscription ID: $SUBSCRIPTION_ID" +``` + +### 2. Create SSH Key Pair + +```bash +# Generate SSH key for Azure VMs +ssh-keygen -t rsa -b 4096 -f ~/.ssh/zathras_azure_ci_key -N "" + +cat ~/.ssh/zathras_azure_ci_key +``` + +### 3. Required GitHub Secrets for Azure + +| Secret Name | Description | Example Value | +|-------------|-------------|---------------| +| `AZURE_CLIENT_ID` | Application (client) ID | `12345678-1234-1234-1234-123456789abc` | +| `AZURE_TENANT_ID` | Directory (tenant) ID | `87654321-4321-4321-4321-cba987654321` | +| `AZURE_SUBSCRIPTION_ID` | Subscription ID | `abcdef01-2345-6789-abcd-ef0123456789` | +| `AZURE_SSH_PRIVATE_KEY` | Private SSH key content | Content of `~/.ssh/zathras_azure_ci_key` | + +## GCP Setup + +### 1. Create Workload Identity Federation + +```bash +# 1. Set variables +PROJECT_ID="your-gcp-project-id" +GITHUB_ORG="your-org" +GITHUB_REPO="zathras" +SERVICE_ACCOUNT_NAME="github-actions-zathras" + +# 2. Create a service account +gcloud iam service-accounts create $SERVICE_ACCOUNT_NAME \ + --display-name="GitHub Actions Zathras CI" \ + --project=$PROJECT_ID + +# 3. Grant necessary permissions to the service account +gcloud projects add-iam-policy-binding $PROJECT_ID \ + --member="serviceAccount:${SERVICE_ACCOUNT_NAME}@${PROJECT_ID}.iam.gserviceaccount.com" \ + --role="roles/compute.admin" + +gcloud projects add-iam-policy-binding $PROJECT_ID \ + --member="serviceAccount:${SERVICE_ACCOUNT_NAME}@${PROJECT_ID}.iam.gserviceaccount.com" \ + --role="roles/iam.serviceAccountUser" + +# 4. 
Create Workload Identity Pool +gcloud iam workload-identity-pools create "github-actions-pool" \ + --project=$PROJECT_ID \ + --location="global" \ + --display-name="GitHub Actions Pool" + +# 5. Create Workload Identity Provider (GitHub issuers require an attribute condition) +gcloud iam workload-identity-pools providers create-oidc "github-provider" \ + --project=$PROJECT_ID \ + --location="global" \ + --workload-identity-pool="github-actions-pool" \ + --display-name="GitHub Provider" \ + --attribute-mapping="google.subject=assertion.sub,attribute.actor=assertion.actor,attribute.repository=assertion.repository" \ + --issuer-uri="https://token.actions.githubusercontent.com" --attribute-condition="assertion.repository == '${GITHUB_ORG}/${GITHUB_REPO}'" + +# 6. Allow the GitHub repo to impersonate the service account +gcloud iam service-accounts add-iam-policy-binding \ + "${SERVICE_ACCOUNT_NAME}@${PROJECT_ID}.iam.gserviceaccount.com" \ + --project=$PROJECT_ID \ + --role="roles/iam.workloadIdentityUser" \ + --member="principalSet://iam.googleapis.com/projects/$(gcloud projects describe $PROJECT_ID --format='value(projectNumber)')/locations/global/workloadIdentityPools/github-actions-pool/attribute.repository/${GITHUB_ORG}/${GITHUB_REPO}" + +# 7. Get the Workload Identity Provider resource name (save for GitHub secrets) +echo "Workload Identity Provider:" +gcloud iam workload-identity-pools providers describe "github-provider" \ + --project=$PROJECT_ID \ + --location="global" \ + --workload-identity-pool="github-actions-pool" \ + --format="value(name)" + +echo "" +echo "Service Account:" +echo "${SERVICE_ACCOUNT_NAME}@${PROJECT_ID}.iam.gserviceaccount.com" +``` + +### 2. Create SSH Key Pair + +```bash +# Generate SSH key for GCP instances +ssh-keygen -t rsa -b 4096 -f ~/.ssh/zathras_gcp_ci_key -N "" + +cat ~/.ssh/zathras_gcp_ci_key +``` + +### 3.
Required GitHub Secrets for GCP + +| Secret Name | Description | Example Value | +|-------------|-------------|---------------| +| `GCP_WORKLOAD_IDENTITY_PROVIDER` | Workload Identity Provider resource name | `projects/123456789/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider` | +| `GCP_SERVICE_ACCOUNT` | Service account email | `github-actions-zathras@your-project.iam.gserviceaccount.com` | +| `GCP_PROJECT_ID` | GCP Project ID | `your-gcp-project-id` | +| `GCP_SSH_PRIVATE_KEY` | Private SSH key content | Content of `~/.ssh/zathras_gcp_ci_key` | + +## Bare Metal Setup + +Bare metal testing requires a self-hosted GitHub Actions runner. + +### 1. Set Up Self-Hosted Runner + +Follow [GitHub's documentation](https://docs.github.com/en/actions/hosting-your-own-runners/adding-self-hosted-runners) to add a self-hosted runner. + +**Recommended configuration:** +- **Labels**: `self-hosted`, `linux`, `baremetal` +- **Runner name**: Something descriptive like `baremetal-ci-runner` + +### 2. Install Dependencies on Runner + +```bash +# On the self-hosted runner machine, run Zathras installation +cd /path/to/zathras +sudo ./bin/install.sh +``` + +### 3. Configure SSH Access + +The runner needs passwordless SSH access to target bare metal systems: + +```bash +# On the runner machine +ssh-keygen -t rsa -b 4096 -f ~/.ssh/zathras_baremetal_key -N "" + +# Copy public key to target systems +ssh-copy-id -i ~/.ssh/zathras_baremetal_key user@target-host +``` + +### 4. Create Local Config Files + +Create local config files for each bare metal system in `local_configs/`: + +```bash +# Example: local_configs/baremetal-test-01.config +cat > local_configs/baremetal-test-01.config << 'EOF' +# Bare metal test system configuration +hostname=baremetal-test-01 +ip_address=192.168.1.100 +ssh_user=root +# Add other system-specific configuration +EOF +``` + +### 5. 
Required GitHub Secrets for Bare Metal + +| Secret Name | Description | Example Value | +|-------------|-------------|---------------| +| `BAREMETAL_SSH_PRIVATE_KEY` | (Optional) SSH private key if not using runner's default | Content of SSH private key | + +## GitHub Secrets Configuration + +### Adding Secrets to GitHub + +1. Go to your repository on GitHub +2. Navigate to **Settings > Secrets and variables > Actions** +3. Click **New repository secret** +4. Add each secret listed above with its corresponding value + +### Secret Summary + +Here's a complete list of all secrets needed: + +**AWS:** +- `AWS_OIDC_ROLE_ARN` +- `AWS_SSH_PRIVATE_KEY` + +**Azure:** +- `AZURE_CLIENT_ID` +- `AZURE_TENANT_ID` +- `AZURE_SUBSCRIPTION_ID` +- `AZURE_SSH_PRIVATE_KEY` + +**GCP:** +- `GCP_WORKLOAD_IDENTITY_PROVIDER` +- `GCP_SERVICE_ACCOUNT` +- `GCP_PROJECT_ID` +- `GCP_SSH_PRIVATE_KEY` + +**Bare Metal:** +- `BAREMETAL_SSH_PRIVATE_KEY` (optional) + +## Testing the Workflows + +### Manual Testing + +1. Go to **Actions** tab in your GitHub repository +2. Select one of the workflows (e.g., "Test AWS") +3. Click **Run workflow** +4. Fill in any required inputs (or use defaults) +5. Click **Run workflow** button +6. Monitor the workflow execution + +### Automatic Testing (PRs) + +When you create a pull request that modifies relevant files, the workflows will automatically run. + +**Monitored paths:** +- `bin/**` - Zathras core scripts +- `ansible_roles/**` - Ansible roles +- `config/**` - Configuration files +- `ci/test_scenarios/*.yml` - Test scenarios +- `.github/workflows/*.yml` - Workflow definitions +- `.github/actions/**` - Composite actions + +### Expected Results + +Each workflow should: +1. ✅ Authenticate to the cloud provider (or verify SSH for bare metal) +2. ✅ Set up Zathras and dependencies +3. ✅ Provision cloud resources (or connect to bare metal) +4. ✅ Run test suite (linpack, streams, fio) +5. ✅ Validate results +6. ✅ Clean up resources +7. 
✅ Upload test results as artifacts +8. ✅ Post results comment on PR (if triggered by PR) + +**Total runtime:** ~15-20 minutes per cloud provider + +## Troubleshooting + +### AWS Issues + +**Problem:** `An error occurred (UnauthorizedOperation) when calling the RunInstances operation` + +**Solution:** Check that the IAM role has the required EC2 permissions listed in the setup section. + +--- + +**Problem:** `Unable to locate credentials` + +**Solution:** Verify that `AWS_OIDC_ROLE_ARN` secret is set correctly and the trust policy allows your repository. + +### Azure Issues + +**Problem:** `AADSTS70021: No matching federated identity record found` + +**Solution:** Ensure federated credentials are created for both `ref:refs/heads/main` and `pull_request` subjects. + +--- + +**Problem:** `Authorization failed` + +**Solution:** Verify the service principal has Contributor role on the subscription. + +### GCP Issues + +**Problem:** `Failed to impersonate service account` + +**Solution:** Check that the Workload Identity Pool allows your repository and the service account has `roles/iam.workloadIdentityUser`. + +--- + +**Problem:** `Permission denied (compute.instances.create)` + +**Solution:** Ensure the service account has `roles/compute.admin` role. + +### Bare Metal Issues + +**Problem:** `No runner available` + +**Solution:** Ensure a self-hosted runner with the `baremetal` label is online and idle. + +--- + +**Problem:** `SSH connection failed` + +**Solution:** Verify SSH keys are configured correctly and the runner can reach the target host. + +--- + +**Problem:** `Local config file not found` + +**Solution:** Create a local config file in `local_configs/<hostname>.config`. + +### General Issues + +**Problem:** Tests timing out + +**Solution:** Increase the `timeout-minutes` value in the workflow file or optimize test parameters in scenario files. + +--- + +**Problem:** Results not uploaded + +**Solution:** Check that the results directory exists and contains files.
The workflow will upload artifacts even on failure. + +--- + +**Problem:** Resources not cleaned up + +**Solution:** Check the "Check for orphaned resources" step in workflow logs. Manually clean up using cloud provider console. + +## Cost Optimization + +To minimize CI costs: + +1. **Use spot/preemptible instances**: Already configured in test scenarios +2. **Limit concurrent workflows**: Configure workflow concurrency in `.github/workflows/` +3. **Manual triggers**: Use `workflow_dispatch` for expensive tests +4. **Smaller instances**: Adjust `host_config` in scenario files for cheaper instance types +5. **Shorter tests**: Reduce benchmark run times in scenario files +6. **Scheduled cleanup**: Set up scheduled workflows to scan for orphaned resources + +## Next Steps + +- [ ] Configure all cloud provider OIDC integrations +- [ ] Add GitHub secrets +- [ ] Test each workflow manually +- [ ] Set up self-hosted runner for bare metal (if needed) +- [ ] Monitor first automated PR runs +- [ ] Adjust timeout values and test parameters as needed +- [ ] Set up cost monitoring alerts in cloud providers + +## Support + +For issues or questions: +- Check workflow logs in GitHub Actions +- Review this documentation +- Open an issue in the repository +- Consult cloud provider documentation for OIDC setup + +--- + +**Last updated:** 2025-12-09 diff --git a/ci/README.md b/ci/README.md index f20d3d61..965d731f 100644 --- a/ci/README.md +++ b/ci/README.md @@ -1,43 +1,115 @@ -# Workflows in this repository +# Zathras CI/CD Infrastructure -## Verify group review +This directory contains CI/CD infrastructure for Zathras, including: +1. **PR Management Workflows** - JIRA integration and review requirements +2. **Test Workflows** - Automated testing across clouds and bare metal + +--- + +## PR Management Workflows + +### Verify group review This workflow requires PRs to have an additional label before they can be merged into the main branch. 
Currently the label is `group_review_lgtm`, which is intended to be issued once the PR has been through a group review. This can be done via the GitHub CLI by using the command `gh pr edit --add-label group_review_lgtm`. -## Verify PR has JIRA ticket and issue number +### Verify PR has JIRA ticket and issue number This workflow is designed to enforce requirements for PR descriptions. At bare minimum it requires the PR to mention a related issue and mention the Jira Ticket number. Both of these are required since Sync2Jira does not know how to associate a PR with a Jira Ticket from the originating GitHub issue. -## Update parent issue +### Update parent issue The idea behind this workflow is to keep Jira tickets in sync with the current status of their GitHub issue. A flowchart for how this works can be seen below. ![flow chart for PR labelling workflow](images/pr_labelling.jpg) This workflow does not work with forked repositories, since the `GITHUB_TOKEN` provided by GitHub runner will not have write access to the base repository unless the pull request originated from the base repository. -# Container -The container image build in [issue-tagging-container] is meant to provide CI helper scripts around to other repositories that +--- + +## Test Workflows + +**NEW:** Automated testing workflows for validating Zathras across multiple environments. 
+ +### Available Test Workflows + +Located in `.github/workflows/`: + +- **test-aws.yml** - AWS cloud testing workflow +- **test-azure.yml** - Azure cloud testing workflow +- **test-gcp.yml** - GCP cloud testing workflow +- **test-baremetal.yml** - Bare metal testing workflow + +### Features + +✅ **Multi-cloud support** - AWS, Azure, GCP, and bare metal +✅ **OIDC authentication** - Secure, keyless authentication +✅ **Automated PR testing** - Runs on PRs that modify relevant code +✅ **Manual workflows** - Can be triggered on-demand +✅ **Cost optimized** - Uses spot instances, minimal resources +✅ **Automatic cleanup** - Terminates cloud resources after tests +✅ **Result artifacts** - Uploads test results for analysis +✅ **PR comments** - Posts test results to pull requests + +### Quick Start + +See **[CI_SETUP.md](CI_SETUP.md)** for complete setup instructions. + +**Required steps:** +1. Configure OIDC authentication for each cloud provider +2. Add GitHub secrets (OIDC role/identity IDs, SSH keys) +3. Test workflows manually +4. (Optional) Set up self-hosted runner for bare metal + +### Test Scenarios + +Test scenarios are located in `test_scenarios/`: +- `aws_ci_test.yml` - AWS configuration +- `azure_ci_test.yml` - Azure configuration +- `gcp_ci_test.yml` - GCP configuration +- `baremetal_ci_test.yml` - Bare metal configuration + +Each scenario runs a standard test suite: **linpack**, **streams**, and **fio** benchmarks. + +**Total runtime:** ~15-20 minutes per cloud + +### Documentation + +- **[CI_SETUP.md](CI_SETUP.md)** - Complete setup guide with OIDC configuration +- **[test_scenarios/README.md](test_scenarios/README.md)** - Test scenario documentation + +--- + +## CI Helper Container + +The container image built in [issue-tagging-container] is meant to provide CI helper scripts to other repositories that reuse the workflows in this repository. All scripts are kept in the `/opt/tools` directory within the container.
-## get_parent_issue.sh +### get_parent_issue.sh This script fetches any parent issues mentioned in a PR. It will output a space separated list of issue numbers. -### Usage +**Usage:** `./get_parent_issue.sh ` -## determine_status.py +### determine_status.py This script will determine the target status of a PR by -looking at the review state. If any reviews request -changes, it will return "in progress", then if any -reviews are pending, it will return "review", if all +looking at the review state. If any reviews request +changes, it will return "in progress", then if any +reviews are pending, it will return "review", if all reviews approve the PR, it will return "approved". -### Usage +**Usage:** `python3 determine_status.py ` OR `gh pr view --json reviewRequests,latestReviews | python3 determine_status.py` + +--- + +## Support + +- **Documentation**: See [CI_SETUP.md](CI_SETUP.md) for test workflow setup +- **Issues**: Open a GitHub issue +- **Questions**: Contact the Zathras maintainers diff --git a/ci/test_scenarios/README.md b/ci/test_scenarios/README.md new file mode 100644 index 00000000..d8b24e66 --- /dev/null +++ b/ci/test_scenarios/README.md @@ -0,0 +1,45 @@ +# CI Test Scenarios + +This directory contains test scenario files used by GitHub Actions workflows to validate Zathras across different cloud providers and bare metal systems. 
+ +## Scenario Files + +- **aws_ci_test.yml** - AWS cloud testing scenario +- **azure_ci_test.yml** - Azure cloud testing scenario +- **gcp_ci_test.yml** - GCP cloud testing scenario +- **baremetal_ci_test.yml** - Bare metal/local system testing scenario + +## Test Coverage + +Each scenario runs a standard test suite consisting of: +- **linpack** - CPU performance benchmark (5 min) +- **streams** - Memory bandwidth benchmark (3 min) +- **fio** - I/O performance benchmark (4 min total for read/write/randread/randwrite) + +Total estimated runtime per cloud: **~15-20 minutes** (including provisioning and cleanup) + +## Configuration + +Cloud scenarios are configured to: +- Use medium-sized instances (cost-effective for testing) +- Enable spot/preemptible instances where possible (cost optimization) +- Use minimal disk sizes (100GB) +- Automatically terminate cloud resources after testing +- Run shortened benchmark durations (faster feedback) + +## Customization + +To modify test parameters: +1. Edit the relevant scenario file +2. Adjust `linpack_run_time`, `streams_run_time`, or `fio_runtime` values +3. Add/remove tests from the `tests:` list +4. Modify instance types in `host_config:` if needed + +## Usage in CI + +GitHub Actions workflows use these scenarios with runtime parameter substitution: +- Cloud OS IDs are set via environment variables +- SSH keys are configured from GitHub secrets +- Region/zone can be overridden for geographic testing + +See `.github/workflows/test-*.yml` for implementation details. 
diff --git a/ci/test_scenarios/aws_ci_test.yml b/ci/test_scenarios/aws_ci_test.yml new file mode 100644 index 00000000..f8a6b91f --- /dev/null +++ b/ci/test_scenarios/aws_ci_test.yml @@ -0,0 +1,33 @@ +# AWS CI Test Scenario +# This scenario is used by GitHub Actions to validate Zathras on AWS +# Runs a standard test suite: linpack, streams, and fio benchmarks + +global: + system_type: aws + # AWS OS ID should be set via environment variable or passed as parameter + # cloud_os_id: ami-xxxxx (set in workflow) + # SSH key file path (set in workflow) + # ssh_key_file: /path/to/key + terminate_cloud: 1 # Always clean up resources after testing + cloud_region: us-east-1 + cloud_zone: us-east-1a + # Use spot instances for cost savings in CI + use_spot: 1 + spot_max_price: 0.50 + +systems: + ci_test_system: + # Standard medium instance for testing + host_config: "m5.xlarge:Disks;number=1;size=100;type=gp3" + tests: linpack,streams,fio + + # Test-specific overrides for faster CI runs + linpack: + linpack_run_time: 300 # 5 minutes instead of default + + streams: + streams_run_time: 180 # 3 minutes + + fio: + fio_runtime: 60 # 1 minute per test + fio_test_types: "read,write,randread,randwrite" # Standard I/O patterns diff --git a/ci/test_scenarios/azure_ci_test.yml b/ci/test_scenarios/azure_ci_test.yml new file mode 100644 index 00000000..7155c418 --- /dev/null +++ b/ci/test_scenarios/azure_ci_test.yml @@ -0,0 +1,33 @@ +# Azure CI Test Scenario +# This scenario is used by GitHub Actions to validate Zathras on Azure +# Runs a standard test suite: linpack, streams, and fio benchmarks + +global: + system_type: azure + # Azure OS ID should be set via environment variable or passed as parameter + # cloud_os_id: /subscriptions/.../resourceGroups/.../images/xxx (set in workflow) + # OR use URN format: Publisher:Offer:Sku:Version + # ssh_key_file: /path/to/key + terminate_cloud: 1 # Always clean up resources after testing + cloud_region: eastus + cloud_zone: 1 + # Use spot instances 
for cost savings in CI + use_spot: 1 + spot_max_price: 0.50 + +systems: + ci_test_system: + # Standard medium instance for testing + host_config: "Standard_D4s_v3:Disks;number=1;size=100;type=Premium_LRS" + tests: linpack,streams,fio + + # Test-specific overrides for faster CI runs + linpack: + linpack_run_time: 300 # 5 minutes instead of default + + streams: + streams_run_time: 180 # 3 minutes + + fio: + fio_runtime: 60 # 1 minute per test + fio_test_types: "read,write,randread,randwrite" # Standard I/O patterns diff --git a/ci/test_scenarios/baremetal_ci_test.yml b/ci/test_scenarios/baremetal_ci_test.yml new file mode 100644 index 00000000..4095f81f --- /dev/null +++ b/ci/test_scenarios/baremetal_ci_test.yml @@ -0,0 +1,33 @@ +# Bare Metal CI Test Scenario +# This scenario is used by GitHub Actions to validate Zathras on bare metal/local systems +# Runs a standard test suite: linpack, streams, and fio benchmarks +# +# REQUIREMENTS: +# - Self-hosted GitHub Actions runner with SSH access to target systems +# - Local config file in local_configs/<hostname>.config +# - SSH key configured for passwordless access + +global: + system_type: local + # No cloud_os_id needed for local systems + # SSH key file path (set in workflow) + # ssh_key_file: /path/to/key + terminate_cloud: 0 # Not applicable for bare metal + +systems: + ci_test_system: + # Hostname should be set via environment variable or passed as parameter + # This should match a config file in local_configs/<hostname>.config + # host_config: (set in workflow or use runner hostname) + tests: linpack,streams,fio + + # Test-specific overrides for faster CI runs + linpack: + linpack_run_time: 300 # 5 minutes instead of default + + streams: + streams_run_time: 180 # 3 minutes + + fio: + fio_runtime: 60 # 1 minute per test + fio_test_types: "read,write,randread,randwrite" # Standard I/O patterns diff --git a/ci/test_scenarios/gcp_ci_test.yml b/ci/test_scenarios/gcp_ci_test.yml new file mode 100644 index 00000000..46ed5e33 --- /dev/null
+++ b/ci/test_scenarios/gcp_ci_test.yml @@ -0,0 +1,31 @@ +# GCP CI Test Scenario +# This scenario is used by GitHub Actions to validate Zathras on GCP +# Runs a standard test suite: linpack, streams, and fio benchmarks + +global: + system_type: gcp + # GCP OS ID should be set via environment variable or passed as parameter + # cloud_os_id: projects/rhel-cloud/global/images/family/rhel-9 (set in workflow) + # ssh_key_file: /path/to/key + terminate_cloud: 1 # Always clean up resources after testing + cloud_region: us-central1 + cloud_zone: us-central1-a + # GCP doesn't use spot_max_price the same way, but we can use preemptible instances + use_spot: 1 + +systems: + ci_test_system: + # Standard medium instance for testing + host_config: "n2-standard-4:Disks;number=1;size=100;type=pd-ssd" + tests: linpack,streams,fio + + # Test-specific overrides for faster CI runs + linpack: + linpack_run_time: 300 # 5 minutes instead of default + + streams: + streams_run_time: 180 # 3 minutes + + fio: + fio_runtime: 60 # 1 minute per test + fio_test_types: "read,write,randread,randwrite" # Standard I/O patterns