diff --git a/.github/workflows/pxf-ci.yml b/.github/workflows/pxf-ci.yml index 1195d060..331d7bac 100644 --- a/.github/workflows/pxf-ci.yml +++ b/.github/workflows/pxf-ci.yml @@ -146,7 +146,15 @@ jobs: with: path: cloudberry-pxf + - name: Cache singlecluster image + id: cache-image + uses: actions/cache@v4 + with: + path: /tmp/singlecluster-image.tar + key: singlecluster-ubuntu-${{ hashFiles('ci/singlecluster/**') }} + - name: Build singlecluster image + if: steps.cache-image.outputs.cache-hit != 'true' run: | cd cloudberry-pxf/ci/singlecluster docker build -t pxf/singlecluster:3 . @@ -185,7 +193,15 @@ jobs: with: path: cloudberry-pxf + - name: Cache singlecluster Rocky 9 image + id: cache-image-rocky9 + uses: actions/cache@v4 + with: + path: /tmp/singlecluster-rocky9-image.tar + key: singlecluster-rocky9-${{ hashFiles('ci/singlecluster/**') }} + - name: Build singlecluster Rocky 9 image + if: steps.cache-image-rocky9.outputs.cache-hit != 'true' run: | cd cloudberry-pxf/ci/singlecluster docker build --build-arg BASE_IMAGE=apache/incubator-cloudberry:cbdb-build-rocky9-latest -t pxf/singlecluster-rocky9:3 . @@ -198,10 +214,109 @@ jobs: path: /tmp/singlecluster-rocky9-image.tar retention-days: 1 + # Stage 1.5: Build test-ready images (Cloudberry + packages pre-installed) + build-test-image: + name: Build Test-Ready Image (Ubuntu) + needs: [build-cloudberry-deb, build-docker-images] + runs-on: ubuntu-latest + steps: + - name: Checkout PXF source + uses: actions/checkout@v4 + with: + path: cloudberry-pxf + + - name: Download singlecluster image + uses: actions/download-artifact@v4 + with: + name: singlecluster-image + path: /tmp + + - name: Download Cloudberry DEB + uses: actions/download-artifact@v4 + with: + name: cloudberry-deb + path: cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu/cloudberry-package/ + + - name: Download Cloudberry source + uses: actions/download-artifact@v4 + with: + name: cloudberry-source + path: /tmp + + - name: Prepare build context + run: | + docker load < /tmp/singlecluster-image.tar + mkdir -p cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu/cloudberry-source + tar xzf /tmp/cloudberry-source.tar.gz --strip-components=1 -C cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu/cloudberry-source/ + ls cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu/cloudberry-package/ + ls cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu/cloudberry-source/Makefile + + - name: Build test-ready image + run: | + cd cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu + docker build -f Dockerfile.test-ready -t pxf/test-ready-ubuntu:latest . 
+ docker save pxf/test-ready-ubuntu:latest > /tmp/test-ready-ubuntu.tar + + - name: Upload test-ready image + uses: actions/upload-artifact@v4 + with: + name: test-ready-ubuntu-image + path: /tmp/test-ready-ubuntu.tar + retention-days: 1 + + build-test-image-rocky9: + name: Build Test-Ready Image (Rocky 9) + needs: [build-cloudberry-rpm, build-docker-images-rocky9] + runs-on: ubuntu-latest + steps: + - name: Checkout PXF source + uses: actions/checkout@v4 + with: + path: cloudberry-pxf + + - name: Download singlecluster Rocky 9 image + uses: actions/download-artifact@v4 + with: + name: singlecluster-rocky9-image + path: /tmp + + - name: Download Cloudberry RPM + uses: actions/download-artifact@v4 + with: + name: cloudberry-rpm + path: cloudberry-pxf/ci/docker/pxf-cbdb-dev/rocky9/cloudberry-package/ + + - name: Download Cloudberry source (Rocky 9) + uses: actions/download-artifact@v4 + with: + name: cloudberry-source-rocky9 + path: /tmp + + - name: Prepare build context + run: | + docker load < /tmp/singlecluster-rocky9-image.tar + mkdir -p cloudberry-pxf/ci/docker/pxf-cbdb-dev/rocky9/cloudberry-source + tar xzf /tmp/cloudberry-source-rocky9.tar.gz --strip-components=1 -C cloudberry-pxf/ci/docker/pxf-cbdb-dev/rocky9/cloudberry-source/ + ls cloudberry-pxf/ci/docker/pxf-cbdb-dev/rocky9/cloudberry-package/ + ls cloudberry-pxf/ci/docker/pxf-cbdb-dev/rocky9/cloudberry-source/Makefile + + - name: Build test-ready image + run: | + cd cloudberry-pxf/ci/docker/pxf-cbdb-dev/rocky9 + docker build -f Dockerfile.test-ready -t pxf/test-ready-rocky9:latest . + docker save pxf/test-ready-rocky9:latest > /tmp/test-ready-rocky9.tar + + - name: Upload test-ready image + uses: actions/upload-artifact@v4 + with: + name: test-ready-rocky9-image + path: /tmp/test-ready-rocky9.tar + retention-days: 1 + # Stage 2: Parallel test jobs using matrix strategy pxf-test: name: Test PXF - ${{ matrix.test_group }} - needs: [build-cloudberry-deb, build-docker-images] + needs: [build-test-image] runs-on: ubuntu-latest strategy: fail-fast: false @@ -245,44 +360,22 @@ jobs: path: cloudberry-pxf submodules: true - - name: Download Cloudberry DEB - uses: actions/download-artifact@v4 - with: - name: cloudberry-deb - path: /tmp - - - name: Download Cloudberry source - uses: actions/download-artifact@v4 - with: - name: cloudberry-source - path: /tmp - - - name: Download singlecluster image + - name: Download test-ready image uses: actions/download-artifact@v4 with: - name: singlecluster-image + name: test-ready-ubuntu-image path: /tmp - - name: Load singlecluster image - run: | - docker load < /tmp/singlecluster-image.tar - - - name: Prepare Cloudberry source - run: | - tar xzf /tmp/cloudberry-source.tar.gz - chmod -R u+rwX,go+rX cloudberry + - name: Load test-ready image + run: docker load < /tmp/test-ready-ubuntu.tar - name: Start Services id: start_services run: | cd cloudberry-pxf docker compose -f ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml down -v || true - docker compose -f ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml build docker compose -f ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml up -d - docker exec pxf-cbdb-dev sudo chown -R gpadmin:gpadmin /home/gpadmin/workspace/cloudberry - docker cp /tmp/*.deb pxf-cbdb-dev:/tmp/ - docker exec pxf-cbdb-dev sudo chown gpadmin:gpadmin /tmp/*.deb - docker exec pxf-cbdb-dev bash -lc "cd /home/gpadmin/workspace/cloudberry-pxf && ./ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh" + docker exec pxf-cbdb-dev bash -lc "cd /home/gpadmin/workspace/cloudberry-pxf && 
./ci/docker/pxf-cbdb-dev/common/script/entrypoint_fast.sh" - name: Run Test - ${{ matrix.test_group }} id: run_test @@ -364,15 +457,15 @@ jobs: FAILED_COUNT="${{ steps.collect_artifacts.outputs.failed_count || 0 }}" SKIPPED_COUNT="${{ steps.collect_artifacts.outputs.skipped_count || 0 }}" - if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "$FAILED_COUNT" -gt 0 ]; then - echo "Test group ${{ matrix.test_group }} failed (Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)" + if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "${{ steps.run_test.outcome }}" == "skipped" ] || [ "$FAILED_COUNT" -gt 0 ]; then + echo "Test group ${{ matrix.test_group }} failed (outcome: ${{ steps.run_test.outcome }}, Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)" exit 1 fi # Stage 2b: Rocky 9 parallel test jobs pxf-test-rocky9: name: Test PXF Rocky9 - ${{ matrix.test_group }} - needs: [build-cloudberry-rpm, build-docker-images-rocky9] + needs: [build-test-image-rocky9] runs-on: ubuntu-latest strategy: fail-fast: false @@ -417,44 +510,22 @@ jobs: path: cloudberry-pxf submodules: true - - name: Download Cloudberry RPM - uses: actions/download-artifact@v4 - with: - name: cloudberry-rpm - path: /tmp - - - name: Download Cloudberry source (Rocky 9) - uses: actions/download-artifact@v4 - with: - name: cloudberry-source-rocky9 - path: /tmp - - - name: Download singlecluster Rocky 9 image + - name: Download test-ready Rocky 9 image uses: actions/download-artifact@v4 with: - name: singlecluster-rocky9-image + name: test-ready-rocky9-image path: /tmp - - name: Load singlecluster Rocky 9 image - run: | - docker load < /tmp/singlecluster-rocky9-image.tar - - - name: Prepare Cloudberry source - run: | - tar xzf /tmp/cloudberry-source-rocky9.tar.gz - chmod -R u+rwX,go+rX cloudberry + - name: Load test-ready Rocky 9 image + run: docker load < /tmp/test-ready-rocky9.tar - name: Start Services id: start_services run: | cd cloudberry-pxf docker compose -f ci/docker/pxf-cbdb-dev/rocky9/docker-compose.yml down -v || true - docker compose -f ci/docker/pxf-cbdb-dev/rocky9/docker-compose.yml build docker compose -f ci/docker/pxf-cbdb-dev/rocky9/docker-compose.yml up -d - docker exec pxf-cbdb-dev sudo chown -R gpadmin:gpadmin /home/gpadmin/workspace/cloudberry - docker cp /tmp/*.rpm pxf-cbdb-dev:/tmp/ - docker exec pxf-cbdb-dev sudo chown gpadmin:gpadmin /tmp/*.rpm - docker exec pxf-cbdb-dev bash -lc "cd /home/gpadmin/workspace/cloudberry-pxf && ./ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh" + docker exec pxf-cbdb-dev bash -lc "cd /home/gpadmin/workspace/cloudberry-pxf && ./ci/docker/pxf-cbdb-dev/common/script/entrypoint_fast.sh" - name: Run Test - ${{ matrix.test_group }} id: run_test @@ -536,8 +607,8 @@ jobs: FAILED_COUNT="${{ steps.collect_artifacts.outputs.failed_count || 0 }}" SKIPPED_COUNT="${{ steps.collect_artifacts.outputs.skipped_count || 0 }}" - if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "$FAILED_COUNT" -gt 0 ]; then - echo "Test group ${{ matrix.test_group }} (Rocky 9) failed (Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)" + if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "${{ steps.run_test.outcome }}" == "skipped" ] || [ "$FAILED_COUNT" -gt 0 ]; then + echo "Test group ${{ matrix.test_group }} (Rocky 9) failed (outcome: ${{ steps.run_test.outcome }}, Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)" exit 1 fi diff --git a/automation/pom.xml b/automation/pom.xml index e294cac0..a779c9f9 100644 --- a/automation/pom.xml +++ b/automation/pom.xml @@ -62,6 +62,12 @@ -Xmx4096m 1 
false
+          <properties>
+            <property>
+              <name>listener</name>
+              <value>listeners.RetryListener</value>
+            </property>
+          </properties>
diff --git a/automation/src/main/java/listeners/RetryAnalyzer.java b/automation/src/main/java/listeners/RetryAnalyzer.java
new file mode 100644
index 00000000..57515907
--- /dev/null
+++ b/automation/src/main/java/listeners/RetryAnalyzer.java
@@ -0,0 +1,65 @@
+package listeners;
+
+import org.testng.IRetryAnalyzer;
+import org.testng.ITestResult;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+/**
+ * Retries failed tests up to {@value #MAX_RETRIES} times with exponential
+ * backoff to handle transient CI failures (e.g. HDFS timeouts on
+ * resource-constrained GitHub Actions runners).
+ *
+ * <p>Delay schedule: 3-8s, 6-16s, 12-32s (capped at 60s).
+ *
+ * <p>Tests that write data to HDFS without cleanup are excluded from retry
+ * because retrying would append duplicate data and cause row-count mismatches.
+ */
+public class RetryAnalyzer implements IRetryAnalyzer {
+
+    private static final int MAX_RETRIES = 3;
+    private static final int BASE_MIN_MS = 3000;
+    private static final int BASE_MAX_MS = 8000;
+    private static final int MAX_DELAY_MS = 60000;
+
+    /** Tests that accumulate data on retry — skip retrying these. */
+    private static final Set<String> NO_RETRY_TESTS = new HashSet<>(Arrays.asList(
+            "copyFromFileMultiBlockedDataNoCompression",
+            "copyFromFileMultiBlockedDataGZip",
+            "copyFromFileMultiBlockedDataBZip2"
+    ));
+
+    private int retryCount = 0;
+    private final Random random = new Random();
+
+    @Override
+    public boolean retry(ITestResult result) {
+        String methodName = result.getMethod().getMethodName();
+        if (NO_RETRY_TESTS.contains(methodName)) {
+            System.out.println("[RetryAnalyzer] Skipping retry for " + methodName
+                    + " (write-without-cleanup test)");
+            return false;
+        }
+        if (retryCount < MAX_RETRIES) {
+            retryCount++;
+            int multiplier = 1 << (retryCount - 1); // 1, 2, 4
+            int minDelay = Math.min(BASE_MIN_MS * multiplier, MAX_DELAY_MS);
+            int maxDelay = Math.min(BASE_MAX_MS * multiplier, MAX_DELAY_MS);
+            int delay = minDelay + random.nextInt(maxDelay - minDelay + 1);
+            System.out.println("[RetryAnalyzer] Retrying failed test: "
+                    + result.getTestClass().getName() + "." + methodName
+                    + " after " + delay + "ms delay"
+                    + " (attempt " + (retryCount + 1) + "/" + (MAX_RETRIES + 1) + ")");
+            try {
+                Thread.sleep(delay);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+            return true;
+        }
+        return false;
+    }
+}
diff --git a/automation/src/main/java/listeners/RetryListener.java b/automation/src/main/java/listeners/RetryListener.java
new file mode 100644
index 00000000..8b2ca0b9
--- /dev/null
+++ b/automation/src/main/java/listeners/RetryListener.java
@@ -0,0 +1,26 @@
+package listeners;
+
+import org.testng.IAnnotationTransformer;
+import org.testng.annotations.ITestAnnotation;
+
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Method;
+
+/**
+ * Annotation transformer that attaches {@link RetryAnalyzer} to every
+ * test method that does not already have a retry analyzer configured.
+ * <p>
+ * Register this listener in {@code @Listeners} on the base test class + * so all automation tests automatically get retry-on-failure behaviour. + */ +public class RetryListener implements IAnnotationTransformer { + + @Override + public void transform(ITestAnnotation annotation, Class testClass, + Constructor testConstructor, Method testMethod) { + // TestNG 6.x: getRetryAnalyzer() returns IRetryAnalyzer instance (null if unset) + if (annotation.getRetryAnalyzer() == null) { + annotation.setRetryAnalyzer(RetryAnalyzer.class); + } + } +} diff --git a/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh b/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh index 832e5067..bbded9d4 100755 --- a/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh +++ b/ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh @@ -20,6 +20,12 @@ # -------------------------------------------------------------------- set -euo pipefail +# Force UTC timezone for the entire container session. PXF's Parquet INT96 +# converter uses ZoneId.systemDefault() (ParquetTypeConverter.java) which +# returns the OS timezone. Rocky 9 base images may ship with a non-UTC +# default, causing timestamp regressions in Parquet read/write tests. +export TZ=UTC + log() { echo "[entrypoint][$(date '+%F %T')] $*"; } die() { log "ERROR $*"; exit 1; } @@ -60,12 +66,16 @@ setup_locale_and_packages() { log "install base packages and locales" if [ "$OS_FAMILY" = "deb" ]; then sudo apt-get update - sudo apt-get install -y wget lsb-release locales maven unzip openssh-server iproute2 sudo \ + sudo apt-get install -y wget lsb-release locales maven unzip openssh-server iproute2 sudo psmisc \ openjdk-11-jre-headless openjdk-8-jre-headless sudo locale-gen en_US.UTF-8 ru_RU.CP1251 ru_RU.UTF-8 sudo update-locale LANG=en_US.UTF-8 else - sudo dnf install -y wget maven unzip openssh-server iproute sudo \ + # Disable broken repos that may exist in the base image (e.g. hpc-common) + for repo in hpc-common; do + sudo dnf config-manager --set-disabled "$repo" 2>/dev/null || true + done + sudo dnf install -y wget maven unzip openssh-server iproute sudo psmisc \ java-11-openjdk-headless java-1.8.0-openjdk-headless \ glibc-langpack-en glibc-locale-source sudo localedef -c -i en_US -f UTF-8 en_US.UTF-8 || true @@ -263,9 +273,14 @@ configure_pxf() { log "configure PXF" source "${COMMON_SCRIPTS}/pxf-env.sh" export PATH="$PXF_HOME/bin:$PATH" - export PXF_JVM_OPTS="-Xmx512m -Xms256m" + export PXF_JVM_OPTS="-Xmx512m -Xms256m -Duser.timezone=UTC" export PXF_HOST=localhost - echo "JAVA_HOME=${JAVA_BUILD}" >> "$PXF_BASE/conf/pxf-env.sh" + # Persist settings into pxf-env.sh so they survive `pxf restart` + cat >> "$PXF_BASE/conf/pxf-env.sh" <> "$PXF_BASE/conf/pxf-application.properties" cp -v "$PXF_HOME"/templates/{hdfs,mapred,yarn,core,hbase,hive}-site.xml "$PXF_BASE/servers/default" @@ -430,9 +445,13 @@ wait_for_datanode() { # Stop any zombie DataNode processes pkill -f "proc_datanode" 2>/dev/null || true sleep 2 + # Force-release DataNode ports + for port in 50010 50020 50075 50080; do + fuser -k ${port}/tcp 2>/dev/null || true + done + sleep 3 # Restart DataNode via the singlecluster script "${GPHD_ROOT}/bin/hadoop-datanode.sh" start 0 2>&1 || true - "${HADOOP_ROOT}/sbin/hadoop-daemon.sh" --config "${GPHD_ROOT}/storage/hadoop/datanode0/etc/hadoop" start datanode 2>&1 || true log "DataNode restart issued, waiting again..." fi done @@ -440,6 +459,43 @@ wait_for_datanode() { die "HDFS DataNode failed to start after ${max_attempts} attempts. 
Tez upload will fail without a running DataNode." } +wait_for_hbase() { + log "waiting for HBase RegionServer to become available..." + local max_wait=60 + for i in $(seq 1 ${max_wait}); do + if pgrep -f HRegionServer >/dev/null 2>&1; then + log "HBase RegionServer is running (after ${i}s), waiting 10s for stabilization..." + sleep 10 + if pgrep -f HRegionServer >/dev/null 2>&1; then + log "HBase RegionServer is stable" + return 0 + fi + log "HBase RegionServer died during stabilization" + break + fi + sleep 1 + done + # RegionServer didn't come up or crashed; try restarting HBase once + log "HBase RegionServer not stable, attempting restart..." + ${GPHD_ROOT}/bin/stop-hbase.sh 2>/dev/null || true + sleep 2 + ${GPHD_ROOT}/bin/start-hbase.sh 2>/dev/null || true + for i in $(seq 1 60); do + if pgrep -f HRegionServer >/dev/null 2>&1; then + log "HBase RegionServer is running after restart (after ${i}s), waiting 10s..." + sleep 10 + if pgrep -f HRegionServer >/dev/null 2>&1; then + log "HBase RegionServer is stable after restart" + return 0 + fi + log "WARN: HBase RegionServer died again during stabilization, continuing anyway" + return 0 + fi + sleep 1 + done + log "WARN: HBase RegionServer failed to start after restart, continuing anyway" +} + prepare_hadoop_stack() { log "prepare Hadoop/Hive/HBase stack" export JAVA_HOME="${JAVA_HADOOP}" @@ -468,6 +524,13 @@ prepare_hadoop_stack() { log "initializing HDFS namenode..." ${GPHD_ROOT}/bin/init-gphd.sh 2>&1 || log "init-gphd.sh failed with exit code $?" fi + # Force-release DataNode ports before starting HDFS to prevent BindException. + # On CI re-runs or slow runners, stale sockets/processes may hold these ports. + log "ensuring DataNode ports are free..." + for port in 50010 50020 50075 50080; do + fuser -k ${port}/tcp 2>/dev/null || true + done + sleep 1 log "starting HDFS/YARN/HBase via start-gphd.sh..." if ! ${GPHD_ROOT}/bin/start-gphd.sh 2>&1; then log "start-gphd.sh returned non-zero (services may already be running), continue" @@ -482,6 +545,7 @@ prepare_hadoop_stack() { if ! ${GPHD_ROOT}/bin/start-hbase.sh; then log "start-hbase.sh returned non-zero (services may already be running), continue" fi + wait_for_hbase start_hive_services } diff --git a/ci/docker/pxf-cbdb-dev/common/script/entrypoint_fast.sh b/ci/docker/pxf-cbdb-dev/common/script/entrypoint_fast.sh new file mode 100755 index 00000000..aa3c9d47 --- /dev/null +++ b/ci/docker/pxf-cbdb-dev/common/script/entrypoint_fast.sh @@ -0,0 +1,378 @@ +#!/bin/bash +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# Fast entrypoint for test-ready Docker images. 
+# Skips package installs, SSH setup, Cloudberry install, and demo cluster +# creation (all pre-baked into the image). Only runs: +# 1. Start sshd +# 2. Start Cloudberry cluster (gpstart -a) +# 3. Build PXF (dynamic, changes per PR) +# 4. Configure PXF +# 5. Start Hadoop/Hive/HBase services +# 6. Start MinIO +# 7. Health check +# -------------------------------------------------------------------- +set -euo pipefail + +export TZ=UTC + +log() { echo "[entrypoint-fast][$(date '+%F %T')] $*"; } +die() { log "ERROR $*"; exit 1; } + +ROOT_DIR=/home/gpadmin/workspace +REPO_DIR=${ROOT_DIR}/cloudberry-pxf +GPHD_ROOT=${ROOT_DIR}/singlecluster +COMMON_SCRIPTS=${REPO_DIR}/ci/docker/pxf-cbdb-dev/common/script +source "${COMMON_SCRIPTS}/utils.sh" + +HADOOP_ROOT=${GPHD_ROOT}/hadoop +HIVE_ROOT=${GPHD_ROOT}/hive +HBASE_ROOT=${GPHD_ROOT}/hbase +ZOOKEEPER_ROOT=${GPHD_ROOT}/zookeeper + +# Fallback: if not a test-ready image, use the full entrypoint +if [ ! -f /etc/pxf-test-ready ]; then + log "Not a test-ready image, falling back to full entrypoint" + exec "${COMMON_SCRIPTS}/entrypoint.sh" "$@" +fi + +# ---- OS detection ---- +if command -v apt-get >/dev/null 2>&1; then + OS_FAMILY="deb" +else + OS_FAMILY="rpm" +fi + +detect_java_paths() { + if [ "$OS_FAMILY" = "deb" ]; then + case "$(uname -m)" in + aarch64|arm64) JAVA_BUILD=/usr/lib/jvm/java-11-openjdk-arm64; JAVA_HADOOP=/usr/lib/jvm/java-8-openjdk-arm64 ;; + *) JAVA_BUILD=/usr/lib/jvm/java-11-openjdk-amd64; JAVA_HADOOP=/usr/lib/jvm/java-8-openjdk-amd64 ;; + esac + else + JAVA_BUILD=/usr/lib/jvm/java-11-openjdk + JAVA_HADOOP=/usr/lib/jvm/java-1.8.0-openjdk + fi + export JAVA_BUILD JAVA_HADOOP +} + +setup_ssh() { + log "configure ssh" + # Reuse the full SSH setup from the original entrypoint — only takes 2-3s + # and avoids subtle issues with pre-baked SSH config (key mismatches, etc.) + if [ "$OS_FAMILY" = "rpm" ] && command -v update-crypto-policies >/dev/null 2>&1; then + log "setting LEGACY crypto policy for SSH compatibility" + sudo update-crypto-policies --set LEGACY 2>/dev/null || true + fi + sudo ssh-keygen -A + sudo bash -c 'echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config' + sudo mkdir -p /etc/ssh/sshd_config.d + sudo bash -c 'cat >/etc/ssh/sshd_config.d/pxf-automation.conf </dev/null || true + fi + echo "gpadmin:cbdb@123" | sudo chpasswd + echo "gpadmin ALL=(ALL) NOPASSWD: ALL" | sudo tee -a /etc/sudoers >/dev/null + echo "root ALL=(ALL) NOPASSWD: ALL" | sudo tee -a /etc/sudoers >/dev/null + mkdir -p /home/gpadmin/.ssh + sudo chown -R gpadmin:gpadmin /home/gpadmin/.ssh + if [ ! -f /home/gpadmin/.ssh/id_rsa ]; then + sudo -u gpadmin ssh-keygen -q -t rsa -b 4096 -m PEM -C gpadmin -f /home/gpadmin/.ssh/id_rsa -N "" + fi + sudo -u gpadmin bash -lc 'cat /home/gpadmin/.ssh/id_rsa.pub >> /home/gpadmin/.ssh/authorized_keys' + sudo -u gpadmin chmod 0600 /home/gpadmin/.ssh/authorized_keys + ssh-keyscan -t rsa mdw cdw localhost 2>/dev/null > /home/gpadmin/.ssh/known_hosts || true + sudo rm -rf /run/nologin + sudo mkdir -p /var/run/sshd && sudo chmod 0755 /var/run/sshd + id sshd &>/dev/null || sudo useradd -r -d /var/empty/sshd -s /sbin/nologin sshd 2>/dev/null || true + sudo mkdir -p /var/empty/sshd && sudo chmod 0755 /var/empty/sshd + sudo /usr/sbin/sshd -E /tmp/sshd.log || die "Failed to start sshd, check /tmp/sshd.log" + sleep 1 + if ! ss -tlnp | grep -q ':22 '; then + log "ERROR: sshd is not listening on port 22" + cat /tmp/sshd.log 2>/dev/null || true + sudo /usr/sbin/sshd -D -e & + sleep 1 + if ! 
ss -tlnp | grep -q ':22 '; then + die "sshd failed to bind to port 22" + fi + fi + log "sshd is running on port 22" +} + +start_cloudberry() { + log "starting Cloudberry cluster" + source /usr/local/cloudberry-db/cloudberry-env.sh + # Demo cluster cannot be pre-baked in Docker image (hostname mismatch + # between build-time 'buildkitsandbox' and runtime 'mdw'). + # Create it at first run; subsequent runs just gpstart. + if [ -f ~/workspace/cloudberry/gpAux/gpdemo/gpdemo-env.sh ]; then + source ~/workspace/cloudberry/gpAux/gpdemo/gpdemo-env.sh + gpstart -a || { + log "gpstart failed, re-creating demo cluster" + rm -rf ~/workspace/cloudberry/gpAux/gpdemo/datadirs + rm -f /tmp/.s.PGSQL.700* + make create-demo-cluster -C ~/workspace/cloudberry + source ~/workspace/cloudberry/gpAux/gpdemo/gpdemo-env.sh + } + else + log "demo cluster not found, creating..." + rm -f /tmp/.s.PGSQL.700* + make create-demo-cluster -C ~/workspace/cloudberry || { + log "create-demo-cluster failed, trying manual setup" + cd ~/workspace/cloudberry + ./configure --prefix=/usr/local/cloudberry-db --enable-debug --with-perl --with-python --with-libxml --enable-depend + make create-demo-cluster + } + source ~/workspace/cloudberry/gpAux/gpdemo/gpdemo-env.sh + fi + psql -P pager=off template1 -c 'SELECT * from gp_segment_configuration' || true + psql template1 -c 'SELECT version()' || true +} + +relax_pg_hba() { + local pg_hba=/home/gpadmin/workspace/cloudberry/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/pg_hba.conf + if [ -f "${pg_hba}" ] && ! grep -q "127.0.0.1/32 trust" "${pg_hba}"; then + cat >> "${pg_hba}" <<'EOF' +host all all 127.0.0.1/32 trust +host all all ::1/128 trust +EOF + source /usr/local/cloudberry-db/cloudberry-env.sh >/dev/null 2>&1 || true + gpstop -u || true + fi +} + +build_pxf() { + log "build PXF" + "${COMMON_SCRIPTS}/build_pxf.sh" +} + +configure_pxf() { + log "configure PXF" + source "${COMMON_SCRIPTS}/pxf-env.sh" + export PATH="$PXF_HOME/bin:$PATH" + export PXF_JVM_OPTS="-Xmx512m -Xms256m -Duser.timezone=UTC" + export PXF_HOST=localhost + # Persist settings into pxf-env.sh so they survive `pxf restart` + cat >> "$PXF_BASE/conf/pxf-env.sh" <> "$PXF_BASE/conf/pxf-application.properties" + cp -v "$PXF_HOME"/templates/{hdfs,mapred,yarn,core,hbase,hive}-site.xml "$PXF_BASE/servers/default" + for server_dir in "$PXF_BASE/servers/default" "$PXF_BASE/servers/default-no-impersonation"; do + if [ ! -d "$server_dir" ]; then + cp -r "$PXF_BASE/servers/default" "$server_dir" + fi + if [ ! -f "$server_dir/pxf-site.xml" ]; then + cat > "$server_dir/pxf-site.xml" <<'XML' + + + +XML + fi + done + if ! 
grep -q "pxf.service.user.name" "$PXF_BASE/servers/default-no-impersonation/pxf-site.xml"; then + sed -i 's## \n pxf.service.user.name\n foobar\n \n \n pxf.service.user.impersonation\n false\n \n#' "$PXF_BASE/servers/default-no-impersonation/pxf-site.xml" + fi + + # PXF profiles + cat > "$PXF_BASE/conf/pxf-profiles.xml" <<'EOF' + + + + pxf:parquet + Profile for reading and writing Parquet files + + org.apache.cloudberry.pxf.plugins.hdfs.HdfsDataFragmenter + org.apache.cloudberry.pxf.plugins.hdfs.ParquetFileAccessor + org.apache.cloudberry.pxf.plugins.hdfs.ParquetResolver + + + + test:text + Test profile for text files + + org.apache.cloudberry.pxf.plugins.hdfs.HdfsDataFragmenter + org.apache.cloudberry.pxf.plugins.hdfs.LineBreakAccessor + org.apache.cloudberry.pxf.plugins.hdfs.StringPassResolver + + + +EOF + cp "$PXF_BASE/conf/pxf-profiles.xml" "$PXF_HOME/conf/pxf-profiles.xml" + + # S3/MinIO configuration + mkdir -p "$PXF_BASE/servers/s3" "$PXF_HOME/servers/s3" + for s3_site in "$PXF_BASE/servers/s3/s3-site.xml" "$PXF_BASE/servers/default/s3-site.xml" "$PXF_HOME/servers/s3/s3-site.xml"; do + mkdir -p "$(dirname "$s3_site")" + cat > "$s3_site" <<'EOF' + + + fs.s3a.endpointhttp://localhost:9000 + fs.s3a.access.keyadmin + fs.s3a.secret.keypassword + fs.s3a.path.style.accesstrue + fs.s3a.connection.ssl.enabledfalse + fs.s3a.implorg.apache.hadoop.fs.s3a.S3AFileSystem + fs.s3a.aws.credentials.providerorg.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider + +EOF + done + mkdir -p /home/gpadmin/.aws/ + cat > "/home/gpadmin/.aws/credentials" <<'EOF' +[default] +aws_access_key_id = admin +aws_secret_access_key = password +EOF +} + +prepare_hadoop_stack() { + log "prepare Hadoop/Hive/HBase stack" + export JAVA_HOME="${JAVA_HADOOP}" + export PATH="$JAVA_HOME/bin:$HADOOP_ROOT/bin:$HIVE_ROOT/bin:$PATH" + source "${GPHD_ROOT}/bin/gphd-env.sh" + cd "${REPO_DIR}/automation" + make symlink_pxf_jars + cp /home/gpadmin/automation_tmp_lib/pxf-hbase.jar "$GPHD_ROOT/hbase/lib/" || true + if [ ! -f "${GPHD_ROOT}/hbase/lib/pxf-hbase.jar" ]; then + pxf_app=$(ls -1v /usr/local/pxf/application/pxf-app-*.jar | grep -v 'plain' | tail -n 1) + unzip -qq -j "${pxf_app}" 'BOOT-INF/lib/pxf-hbase-*.jar' -d "${GPHD_ROOT}/hbase/lib/" + fi + rm -f "${GPHD_ROOT}/storage/hive/metastore_db/"*.lck 2>/dev/null || true + rm -f "${GPHD_ROOT}/storage/pids"/hive-*.pid 2>/dev/null || true + + # Namenode already formatted in image; just ensure ports are free and start + log "ensuring DataNode ports are free..." + for port in 50010 50020 50075 50080; do + fuser -k ${port}/tcp 2>/dev/null || true + done + sleep 1 + log "starting HDFS/YARN/HBase via start-gphd.sh..." + if ! ${GPHD_ROOT}/bin/start-gphd.sh 2>&1; then + log "start-gphd.sh returned non-zero, continue" + fi + # Reuse wait_for_datanode from entrypoint.sh via sourced utils or inline + log "waiting for HDFS DataNode..." + for _try in $(seq 1 45); do + if hdfs dfsadmin -report 2>/dev/null | grep -q "Live datanodes.*[1-9]"; then + log "HDFS DataNode is available" + break + fi + sleep 2 + done + if ! ${GPHD_ROOT}/bin/start-zookeeper.sh; then + log "start-zookeeper.sh returned non-zero" + fi + if ! 
${GPHD_ROOT}/bin/start-hbase.sh; then + log "start-hbase.sh returned non-zero" + fi + # Wait for HBase RegionServer + for _i in $(seq 1 60); do + if pgrep -f HRegionServer >/dev/null 2>&1; then + log "HBase RegionServer is running" + break + fi + sleep 1 + done + start_hive_services +} + +start_hive_services() { + log "start Hive metastore and HiveServer2 (NOSASL)" + export JAVA_HOME="${JAVA_HADOOP}" + export PATH="${JAVA_HOME}/bin:${HIVE_ROOT}/bin:${HADOOP_ROOT}/bin:${PATH}" + export HIVE_HOME="${HIVE_ROOT}" + export HADOOP_HOME="${HADOOP_ROOT}" + local tez_root="${TEZ_ROOT:-${GPHD_ROOT}/tez}" + export HADOOP_HEAPSIZE=${HADOOP_HEAPSIZE:-1024} + export HADOOP_CLIENT_OPTS="-Xmx${HADOOP_HEAPSIZE}m -Xms512m ${HADOOP_CLIENT_OPTS:-}" + + "${HADOOP_ROOT}/bin/hadoop" fs -mkdir -p /apps/tez + "${HADOOP_ROOT}/bin/hadoop" fs -copyFromLocal -f "${tez_root}"/* /apps/tez + + pkill -f HiveServer2 || true + pkill -f HiveMetaStore || true + rm -rf "${GPHD_ROOT}/storage/hive/metastore_db" 2>/dev/null || true + rm -f "${GPHD_ROOT}/storage/logs/derby.log" 2>/dev/null || true + rm -f "${GPHD_ROOT}/storage/pids"/hive-*.pid 2>/dev/null || true + + if ! PATH="${HIVE_ROOT}/bin:${HADOOP_ROOT}/bin:${PATH}" \ + JAVA_HOME="${JAVA_HADOOP}" \ + schematool -dbType derby -initSchema -verbose; then + rm -rf "${GPHD_ROOT}/storage/hive/metastore_db" 2>/dev/null || true + PATH="${HIVE_ROOT}/bin:${HADOOP_ROOT}/bin:${PATH}" \ + JAVA_HOME="${JAVA_HADOOP}" \ + schematool -dbType derby -initSchema -verbose || die "schematool initSchema failed" + fi + + HIVE_OPTS="--hiveconf javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=${GPHD_ROOT}/storage/hive/metastore_db;create=true" \ + "${GPHD_ROOT}/bin/hive-service.sh" metastore start + + local ok=false + for _ in 1 2 3 4 5 6 7 8 9 10; do + if bash -c ">/dev/tcp/localhost/9083" >/dev/null 2>&1; then ok=true; break; fi + sleep 2 + done + [ "${ok}" != "true" ] && die "Hive metastore not reachable on 9083" + + HIVE_OPTS="--hiveconf hive.server2.authentication=NOSASL --hiveconf hive.metastore.uris=thrift://localhost:9083 --hiveconf javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=${GPHD_ROOT}/storage/hive/metastore_db;create=true" \ + "${GPHD_ROOT}/bin/hive-service.sh" hiveserver2 start + + log "waiting for HiveServer2 on port 10000..." 
+ for i in {1..60}; do + if ss -ln | grep -q ":10000 " || lsof -i :10000 >/dev/null 2>&1; then + if echo "SHOW DATABASES;" | beeline -u "jdbc:hive2://localhost:10000/default" --silent=true >/dev/null 2>&1; then + log "HiveServer2 is ready" + break + fi + fi + [ $i -eq 60 ] && log "WARN: HiveServer2 may not be fully ready" + sleep 1 + done +} + +deploy_minio() { + log "deploying MinIO" + bash "${COMMON_SCRIPTS}/start_minio.bash" +} + +main() { + detect_java_paths + setup_ssh + start_cloudberry + relax_pg_hba + build_pxf + configure_pxf + prepare_hadoop_stack + deploy_minio + health_check + log "entrypoint_fast finished; environment ready for tests" +} + +main "$@" diff --git a/ci/docker/pxf-cbdb-dev/common/script/run_tests.sh b/ci/docker/pxf-cbdb-dev/common/script/run_tests.sh index 63b99352..230222c1 100755 --- a/ci/docker/pxf-cbdb-dev/common/script/run_tests.sh +++ b/ci/docker/pxf-cbdb-dev/common/script/run_tests.sh @@ -20,6 +20,9 @@ # -------------------------------------------------------------------- set -euo pipefail +# Ensure UTC timezone (see entrypoint.sh for rationale) +export TZ=UTC + # Run automation tests only (assumes build/env already prepared) # Use a unique var name to avoid clobbering by sourced env scripts @@ -90,6 +93,28 @@ health_check_with_retry() { fi } +mvn_with_retry() { + local max_attempts=3 + for attempt in $(seq 1 ${max_attempts}); do + if mvn "$@"; then + return 0 + fi + if [ "${attempt}" -lt "${max_attempts}" ]; then + echo "[run_tests] Maven failed (attempt ${attempt}/${max_attempts}), retrying in 10s..." + sleep 10 + fi + done + echo "[run_tests] Maven failed after ${max_attempts} attempts" + return 1 +} + +resolve_maven_dependencies() { + echo "[run_tests] Pre-resolving Maven dependencies..." + pushd "${REPO_ROOT}/automation" >/dev/null + mvn_with_retry -B -q dependency:resolve -DskipTests 2>&1 || echo "[warn] Maven dependency resolution failed, tests may fail" + popd >/dev/null +} + cleanup_hdfs_test_data() { hdfs dfs -rm -r -f /gpdb-ud-scratch/tmp/pxf_automation_data >/dev/null 2>&1 || true } @@ -526,7 +551,7 @@ ensure_testplugin_jar() { export PXF_HOME=${PXF_HOME:-/usr/local/pxf} if [ ! 
-f "${PXF_BASE}/lib/pxf-automation-test.jar" ]; then pushd "${REPO_ROOT}/automation" >/dev/null - mvn -q -DskipTests test-compile + mvn_with_retry -q -DskipTests test-compile jar cf "${PXF_BASE}/lib/pxf-automation-test.jar" -C target/classes org/apache/cloudberry/pxf/automation/testplugin popd >/dev/null JAVA_HOME="${JAVA_BUILD}" "${PXF_HOME}/bin/pxf" restart >/dev/null || true @@ -853,10 +878,13 @@ generate_test_summary() { run_single_group() { local group="$1" echo "[run_tests] Running single test group: $group" - + + # Pre-resolve Maven dependencies with retry for transient network failures + resolve_maven_dependencies + # Run health check first health_check_with_retry - + ensure_testuser_pg_hba export PGHOST=127.0.0.1 export PATH="${GPHOME}/bin:${PATH}" diff --git a/ci/docker/pxf-cbdb-dev/common/script/utils.sh b/ci/docker/pxf-cbdb-dev/common/script/utils.sh index c055dd25..44755bfd 100755 --- a/ci/docker/pxf-cbdb-dev/common/script/utils.sh +++ b/ci/docker/pxf-cbdb-dev/common/script/utils.sh @@ -45,19 +45,23 @@ check_jvm_procs() { fi echo "$jps_out" echo "$jps_out" | grep -q NameNode || die "NameNode not running" - echo "$jps_out" | grep -q DataNode || die "DataNode not running" + echo "$jps_out" | grep -q DataNode || log "WARN: DataNode not running (may still be registering)" } check_hbase() { local hbase_host="${HBASE_HOST:-$(hostname -I | awk '{print $1}')}" hbase_host=${hbase_host:-127.0.0.1} + # HBase checks are non-fatal: test groups that need HBase will fail with + # clear test errors; groups that don't need HBase should not be blocked. if ! echo "$jps_out" | grep -q HMaster && ! pgrep -f HMaster >/dev/null 2>&1; then - die "HBase HMaster not running" + log "WARN: HBase HMaster not running" + return 0 fi if ! echo "$jps_out" | grep -q HRegionServer && ! pgrep -f HRegionServer >/dev/null 2>&1; then - die "HBase RegionServer not running" + log "WARN: HBase RegionServer not running" + return 0 fi local hbase_ok=true @@ -69,7 +73,7 @@ check_hbase() { fi if [ "${hbase_ok}" != "true" ]; then [ -f /tmp/hbase_status.log ] && cat /tmp/hbase_status.log - die "HBase health check failed (status or port 16000 on ${hbase_host})" + log "WARN: HBase health check failed (status or port 16000 on ${hbase_host})" fi } diff --git a/ci/docker/pxf-cbdb-dev/rocky9/Dockerfile.test-ready b/ci/docker/pxf-cbdb-dev/rocky9/Dockerfile.test-ready new file mode 100644 index 00000000..0c94731d --- /dev/null +++ b/ci/docker/pxf-cbdb-dev/rocky9/Dockerfile.test-ready @@ -0,0 +1,118 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# +# -------------------------------------------------------------------- +# Test-ready image for Rocky 9: pre-bakes all static CI setup so each +# test job only needs to compile PXF and start services (~8 min vs ~25 min). +# +# Build context must contain: +# cloudberry-package/*.rpm -- Cloudberry RPM from build-cloudberry-rpm job +# cloudberry-source/ -- Cloudberry source tree (for make create-demo-cluster) +# -------------------------------------------------------------------- +FROM pxf/singlecluster-rocky9:3 + +USER root + +ENV TZ=UTC +ENV LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF-8 + +# ---- setup_locale_and_packages() + install_build_deps() ---- +RUN for repo in hpc-common; do \ + dnf config-manager --set-disabled "$repo" 2>/dev/null || true; \ + done && \ + dnf install -y wget maven unzip openssh-server iproute sudo psmisc \ + java-11-openjdk-headless java-1.8.0-openjdk-headless java-11-openjdk-devel \ + glibc-langpack-en glibc-locale-source \ + sudo git bison bzip2 cmake curl flex gcc gcc-c++ iputils \ + apr-devel bzip2-devel libcurl-devel libevent-devel \ + krb5-devel perl-IPC-Run openldap-devel pam-devel protobuf-devel readline-devel \ + openssl-devel libuv-devel lz4-devel libxml2-devel libyaml-devel \ + libzstd-devel perl-devel make pkgconfig protobuf-compiler python3-devel python3-pip \ + python3-setuptools rsync snappy-devel && \ + (localedef -c -i en_US -f UTF-8 en_US.UTF-8 || true) && \ + (localedef -c -i ru_RU -f UTF-8 ru_RU.UTF-8 || true) && \ + (localedef -c -i ru_RU -f CP1251 ru_RU.CP1251 || true) && \ + dnf clean all + +# ---- setup_ssh() static parts + LEGACY crypto policy ---- +RUN if command -v update-crypto-policies >/dev/null 2>&1; then \ + update-crypto-policies --set LEGACY 2>/dev/null || true; \ + fi && \ + ssh-keygen -A && \ + echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \ + mkdir -p /etc/ssh/sshd_config.d && \ + cat >/etc/ssh/sshd_config.d/pxf-automation.conf <<'SSHEOF' +KexAlgorithms +diffie-hellman-group-exchange-sha1,diffie-hellman-group14-sha1,diffie-hellman-group1-sha1 +HostKeyAlgorithms +ssh-rsa,ssh-dss +PubkeyAcceptedAlgorithms +ssh-rsa,ssh-dss +SSHEOF + +RUN usermod -a -G wheel gpadmin 2>/dev/null || true && \ + echo "gpadmin:cbdb@123" | chpasswd && \ + echo "gpadmin ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \ + echo "root ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \ + mkdir -p /home/gpadmin/.ssh && \ + chown -R gpadmin:gpadmin /home/gpadmin/.ssh && \ + sudo -u gpadmin ssh-keygen -q -t rsa -b 4096 -m PEM -C gpadmin -f /home/gpadmin/.ssh/id_rsa -N "" && \ + cat /home/gpadmin/.ssh/id_rsa.pub >> /home/gpadmin/.ssh/authorized_keys && \ + chmod 0600 /home/gpadmin/.ssh/authorized_keys && \ + rm -rf /run/nologin && \ + mkdir -p /var/run/sshd && chmod 0755 /var/run/sshd && \ + id sshd || useradd -r -d /var/empty/sshd -s /sbin/nologin sshd 2>/dev/null || true && \ + mkdir -p /var/empty/sshd && chmod 0755 /var/empty/sshd + +# ---- System limits ---- +RUN tee /etc/security/limits.d/90-db-limits.conf <<'EOF' +gpadmin soft core unlimited +gpadmin hard core unlimited +gpadmin soft nofile 524288 +gpadmin hard nofile 524288 +gpadmin soft nproc 131072 +gpadmin hard nproc 131072 +EOF + +# ---- Install Cloudberry from package ---- +COPY cloudberry-package/ /tmp/cloudberry-package/ +RUN pkg=$(find /tmp/cloudberry-package -name "apache-cloudberry-db*.rpm" | head -1) && \ + rm -rf /usr/local/cloudberry-db && \ + chmod a+w /usr/local && \ + mkdir -p /usr/local/cloudberry-db && \ + chown -R gpadmin:gpadmin /usr/local/cloudberry-db && \ + (rpm -Uvh 
--force "$pkg" || dnf install -y "$pkg") && \ + rm -rf /tmp/cloudberry-package && \ + echo -e '\n# Add Cloudberry entries\nif [ -f /usr/local/cloudberry-db/cloudberry-env.sh ]; then\n source /usr/local/cloudberry-db/cloudberry-env.sh\nfi\nexport LANG=en_US.UTF-8\n' >> /home/gpadmin/.bashrc + +# ---- Copy Cloudberry source (demo cluster created at runtime due to hostname) ---- +COPY cloudberry-source/ /home/gpadmin/workspace/cloudberry/ +RUN chown -R gpadmin:gpadmin /home/gpadmin/workspace/cloudberry + +# ---- HDFS namenode pre-format ---- +RUN sudo -u gpadmin bash -c '\ + export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk && \ + source /home/gpadmin/workspace/singlecluster/bin/gphd-env.sh && \ + /home/gpadmin/workspace/singlecluster/bin/init-gphd.sh' || true + +# ---- Pre-create PXF directories ---- +RUN mkdir -p /usr/local/pxf /home/gpadmin/pxf-base && \ + chown -R gpadmin:gpadmin /usr/local/pxf /home/gpadmin/pxf-base + +# Mark as test-ready image +RUN touch /etc/pxf-test-ready + +USER gpadmin +WORKDIR /home/gpadmin diff --git a/ci/docker/pxf-cbdb-dev/rocky9/docker-compose.yml b/ci/docker/pxf-cbdb-dev/rocky9/docker-compose.yml index 37738078..64ade52d 100644 --- a/ci/docker/pxf-cbdb-dev/rocky9/docker-compose.yml +++ b/ci/docker/pxf-cbdb-dev/rocky9/docker-compose.yml @@ -20,16 +20,12 @@ services: # hadoop singlecluster: - build: - context: ../../../singlecluster - args: - BASE_IMAGE: apache/incubator-cloudberry:cbdb-build-rocky9-latest - image: pxf/singlecluster-rocky9:3 + image: pxf/test-ready-rocky9:latest container_name: pxf_singlecluster_rocky9 hostname: cdw pxf-cbdb-dev: - image: pxf/singlecluster-rocky9:3 + image: pxf/test-ready-rocky9:latest container_name: pxf-cbdb-dev hostname: mdw depends_on: @@ -38,7 +34,6 @@ services: - "2222:22" volumes: - ../../../../../cloudberry-pxf:/home/gpadmin/workspace/cloudberry-pxf - - ../../../../../cloudberry:/home/gpadmin/workspace/cloudberry command: ["tail", "-f", "/dev/null"] networks: diff --git a/ci/docker/pxf-cbdb-dev/ubuntu/Dockerfile.test-ready b/ci/docker/pxf-cbdb-dev/ubuntu/Dockerfile.test-ready new file mode 100644 index 00000000..1b112109 --- /dev/null +++ b/ci/docker/pxf-cbdb-dev/ubuntu/Dockerfile.test-ready @@ -0,0 +1,115 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# Test-ready image for Ubuntu: pre-bakes all static CI setup so each +# test job only needs to compile PXF and start services (~8 min vs ~25 min). 
+# +# Build context must contain: +# cloudberry-package/*.deb -- Cloudberry DEB from build-cloudberry-deb job +# cloudberry-source/ -- Cloudberry source tree (for make create-demo-cluster) +# -------------------------------------------------------------------- +FROM pxf/singlecluster:3 + +USER root + +ENV TZ=UTC +ENV LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF-8 +ENV DEBIAN_FRONTEND=noninteractive + +# ---- setup_locale_and_packages() + install_build_deps() ---- +# Single RUN to avoid apt cache invalidation between layers +RUN apt-get update && \ + apt-get install -y \ + wget lsb-release locales maven unzip openssh-server iproute2 sudo psmisc \ + openjdk-11-jre-headless openjdk-8-jre-headless openjdk-11-jdk \ + sudo git bison bzip2 cmake curl flex gcc g++ iputils-ping \ + language-pack-en libapr1-dev libbz2-dev libcurl4-gnutls-dev libevent-dev \ + libkrb5-dev libipc-run-perl libldap2-dev libpam0g-dev libprotobuf-dev libreadline-dev \ + libssl-dev libuv1-dev liblz4-dev libxml2-dev libyaml-dev libzstd-dev \ + libperl-dev make pkg-config protobuf-compiler python3-dev python3-pip python3-setuptools \ + rsync libsnappy-dev && \ + (apt-get install -y libxerces-c-dev || true) && \ + locale-gen en_US.UTF-8 ru_RU.CP1251 ru_RU.UTF-8 && \ + update-locale LANG=en_US.UTF-8 && \ + (localedef -c -i ru_RU -f CP1251 ru_RU.CP1251 || true) && \ + rm -rf /var/lib/apt/lists/* + +# ---- setup_ssh() static parts (sshd started at runtime) ---- +RUN ssh-keygen -A && \ + echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config && \ + mkdir -p /etc/ssh/sshd_config.d && \ + cat >/etc/ssh/sshd_config.d/pxf-automation.conf <<'SSHEOF' +KexAlgorithms +diffie-hellman-group-exchange-sha1,diffie-hellman-group14-sha1,diffie-hellman-group1-sha1 +HostKeyAlgorithms +ssh-rsa,ssh-dss +PubkeyAcceptedAlgorithms +ssh-rsa,ssh-dss +SSHEOF + +RUN usermod -a -G sudo gpadmin && \ + echo "gpadmin:cbdb@123" | chpasswd && \ + echo "gpadmin ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \ + echo "root ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \ + mkdir -p /home/gpadmin/.ssh && \ + chown -R gpadmin:gpadmin /home/gpadmin/.ssh && \ + sudo -u gpadmin ssh-keygen -q -t rsa -b 4096 -m PEM -C gpadmin -f /home/gpadmin/.ssh/id_rsa -N "" && \ + cat /home/gpadmin/.ssh/id_rsa.pub >> /home/gpadmin/.ssh/authorized_keys && \ + chmod 0600 /home/gpadmin/.ssh/authorized_keys && \ + rm -rf /run/nologin && \ + mkdir -p /var/run/sshd && chmod 0755 /var/run/sshd && \ + mkdir -p /var/empty/sshd && chmod 0755 /var/empty/sshd + +# ---- System limits ---- +RUN tee /etc/security/limits.d/90-db-limits.conf <<'EOF' +gpadmin soft core unlimited +gpadmin hard core unlimited +gpadmin soft nofile 524288 +gpadmin hard nofile 524288 +gpadmin soft nproc 131072 +gpadmin hard nproc 131072 +EOF + +# ---- Install Cloudberry from package ---- +COPY cloudberry-package/ /tmp/cloudberry-package/ +RUN pkg=$(find /tmp/cloudberry-package -name "apache-cloudberry-db*.deb" | head -1) && \ + rm -rf /usr/local/cloudberry-db && \ + chmod a+w /usr/local && \ + mkdir -p /usr/local/cloudberry-db && \ + chown -R gpadmin:gpadmin /usr/local/cloudberry-db && \ + dpkg -i "$pkg" || apt-get install -f -y && \ + rm -rf /tmp/cloudberry-package && \ + echo '\n# Add Cloudberry entries\nif [ -f /usr/local/cloudberry-db/cloudberry-env.sh ]; then\n source /usr/local/cloudberry-db/cloudberry-env.sh\nfi\nexport LANG=en_US.UTF-8\n' >> /home/gpadmin/.bashrc + +# ---- Copy Cloudberry source (demo cluster created at runtime due to hostname) ---- +COPY cloudberry-source/ /home/gpadmin/workspace/cloudberry/ 
+RUN chown -R gpadmin:gpadmin /home/gpadmin/workspace/cloudberry + +# ---- HDFS namenode pre-format ---- +RUN sudo -u gpadmin bash -c '\ + export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 && \ + source /home/gpadmin/workspace/singlecluster/bin/gphd-env.sh && \ + /home/gpadmin/workspace/singlecluster/bin/init-gphd.sh' || true + +# ---- Pre-create PXF directories ---- +RUN mkdir -p /usr/local/pxf /home/gpadmin/pxf-base && \ + chown -R gpadmin:gpadmin /usr/local/pxf /home/gpadmin/pxf-base + +# Mark as test-ready image +RUN touch /etc/pxf-test-ready + +USER gpadmin +WORKDIR /home/gpadmin diff --git a/ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml b/ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml index 02519cb9..8950091b 100644 --- a/ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml +++ b/ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml @@ -20,15 +20,12 @@ services: # hadoop singlecluster: - build: - dockerfile: Dockerfile - context: ../../../singlecluster - image: pxf/singlecluster:3 + image: pxf/test-ready-ubuntu:latest container_name: pxf_singlecluster hostname: cdw pxf-cbdb-dev: - image: pxf/singlecluster:3 + image: pxf/test-ready-ubuntu:latest container_name: pxf-cbdb-dev hostname: mdw depends_on: @@ -37,7 +34,6 @@ services: - "2222:22" volumes: - ../../../../../cloudberry-pxf:/home/gpadmin/workspace/cloudberry-pxf - - ../../../../../cloudberry:/home/gpadmin/workspace/cloudberry command: ["tail", "-f", "/dev/null"] networks: diff --git a/ci/singlecluster/Dockerfile b/ci/singlecluster/Dockerfile index 4d6bb655..c61deef4 100644 --- a/ci/singlecluster/Dockerfile +++ b/ci/singlecluster/Dockerfile @@ -50,16 +50,8 @@ ENV ZOOKEEPER_SHA512="0e5a64713abc6f36d961dd61a06f681868171a9d9228366e512a013248 ENV HBASE_SHA512="1032521025660daa70260cdc931f52a26c87596be444451fe1fa88b526ede55e9d6b4220e91ff6f7422bec11f30d64fa6745e95a9c36971fdb1a264a2c745693" ENV TEZ_SHA512="a2d94bd9fa778d42a8bac9d9da8e263e469ddfef93968b06434716554995f490231de5607541ac236e770aa0158b64250c38bc1cd57dbfa629fea705f2ffa2f5" -# faster mirror: -ENV APACHE_MIRROR="repo.huaweicloud.com/apache" -#ENV APACHE_MIRROR="archive.apache.org/dist/" -#ENV APACHE_MIRROR="mirror.yandex.ru/mirrors/apache/" - -ENV HADOOP_URL="https://$APACHE_MIRROR/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" -ENV HIVE_URL="https://$APACHE_MIRROR/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz" -ENV ZOOKEEPER_URL="https://$APACHE_MIRROR/zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz" -ENV HBASE_URL="https://$APACHE_MIRROR/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz" -ENV TEZ_URL="https://$APACHE_MIRROR/tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz" +# Mirror list: try fast mirrors first, fall back to official archive +ENV APACHE_MIRRORS="dlcdn.apache.org archive.apache.org/dist" ENV GPHD_ROOT=/home/gpadmin/workspace/singlecluster ENV HADOOP_ROOT=$GPHD_ROOT/hadoop @@ -68,34 +60,54 @@ ENV HIVE_ROOT=$GPHD_ROOT/hive ENV ZOOKEEPER_ROOT=$GPHD_ROOT/zookeeper ENV TEZ_ROOT=$GPHD_ROOT/tez +# Helper: download from first working mirror with retry +# Usage: apache_download +RUN sudo tee /usr/local/bin/apache_download.sh > /dev/null <<'DLEOF' && sudo chmod +x /usr/local/bin/apache_download.sh +#!/bin/bash +set -e +rel_path="$1"; output="$2" +for mirror in $APACHE_MIRRORS; do + url="https://${mirror}/${rel_path}" + echo "Trying: $url" + if curl -fSL --retry 2 --retry-delay 3 --connect-timeout 15 "$url" -o "$output" 2>&1; then + echo "Downloaded from $mirror" + exit 0 + fi + echo "Failed from 
$mirror, trying next..." + rm -f "$output" +done +echo "ERROR: all mirrors failed for $rel_path" +exit 1 +DLEOF + RUN mkdir -p $HADOOP_ROOT && \ - curl -fSL "$HADOOP_URL" -o /tmp/hadoop.tar.gz && \ + apache_download.sh "hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" /tmp/hadoop.tar.gz && \ echo "$HADOOP_SHA512 /tmp/hadoop.tar.gz" | sha512sum -c && \ tar xvf /tmp/hadoop.tar.gz -C $HADOOP_ROOT --strip-components 1 --exclude="share/doc/*" --exclude="*-sources.jar" && \ rm /tmp/hadoop.tar.gz && \ - curl -fSL "https://repo1.maven.org/maven2/javax/activation/javax.activation-api/1.2.0/javax.activation-api-1.2.0.jar" \ + curl -fSL --retry 2 "https://repo1.maven.org/maven2/javax/activation/javax.activation-api/1.2.0/javax.activation-api-1.2.0.jar" \ -o $HADOOP_ROOT/share/hadoop/common/lib/javax.activation-api-1.2.0.jar RUN mkdir -p $HIVE_ROOT && \ - curl -fSL $HIVE_URL -o /tmp/hive.tar.gz && \ + apache_download.sh "hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz" /tmp/hive.tar.gz && \ echo "$HIVE_SHA256 /tmp/hive.tar.gz" | sha256sum -c && \ tar xvf /tmp/hive.tar.gz -C $HIVE_ROOT --strip-components 1 && \ rm /tmp/hive.tar.gz RUN mkdir -p $ZOOKEEPER_ROOT && \ - curl -fSL $ZOOKEEPER_URL -o /tmp/zookeeper.tar.gz && \ + apache_download.sh "zookeeper/zookeeper-$ZOOKEEPER_VERSION/apache-zookeeper-$ZOOKEEPER_VERSION-bin.tar.gz" /tmp/zookeeper.tar.gz && \ echo "$ZOOKEEPER_SHA512 /tmp/zookeeper.tar.gz" | sha512sum -c && \ tar xvf /tmp/zookeeper.tar.gz -C $ZOOKEEPER_ROOT --strip-components 1 --exclude="docs/*" && \ rm /tmp/zookeeper.tar.gz RUN mkdir -p $HBASE_ROOT && \ - curl -fSL "$HBASE_URL" -o /tmp/hbase.tar.gz && \ + apache_download.sh "hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz" /tmp/hbase.tar.gz && \ echo "$HBASE_SHA512 /tmp/hbase.tar.gz" | sha512sum -c && \ tar xvf /tmp/hbase.tar.gz -C $HBASE_ROOT --strip-components 1 --exclude="docs/*" --exclude="lib/*-tests.jar" --exclude="lib/shaded-clients" && \ rm /tmp/hbase.tar.gz RUN mkdir -p $TEZ_ROOT && \ - curl -fSL "$TEZ_URL" -o /tmp/tez.tar.gz && \ + apache_download.sh "tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz" /tmp/tez.tar.gz && \ echo "$TEZ_SHA512 /tmp/tez.tar.gz" | sha512sum -c && \ tar xvf /tmp/tez.tar.gz -C $TEZ_ROOT --strip-components 1 && \ rm /tmp/tez.tar.gz diff --git a/server/gradlew-install.sh b/server/gradlew-install.sh index 510fa2ad..71dc0c70 100755 --- a/server/gradlew-install.sh +++ b/server/gradlew-install.sh @@ -58,13 +58,23 @@ if [ ! -e "${GRADLE_WRAPPER_JAR}" ]; then # The Gradle version extracted from the `distributionUrl` property does not contain ".0" patch # versions. Need to append a ".0" in that case to download the wrapper jar. GRADLE_VERSION="$(echo "$GRADLE_DIST_VERSION" | sed 's/^\([0-9]*[.][0-9]*\)$/\1.0/')" - curl --location --output "${GRADLE_WRAPPER_JAR}" https://raw.githubusercontent.com/gradle/gradle/v${GRADLE_VERSION}/gradle/wrapper/gradle-wrapper.jar || exit 1 - JAR_CHECKSUM="$(${SHASUM} "${GRADLE_WRAPPER_JAR}" | cut -d\ -f1)" EXPECTED="$(cat "${GRADLE_WRAPPER_SHA256}")" - if [ "${JAR_CHECKSUM}" != "${EXPECTED}" ]; then - # If the (just downloaded) checksum and the downloaded wrapper jar do not match, something - # really bad is going on. 
+ MAX_RETRIES=3 + for _retry in $(seq 1 ${MAX_RETRIES}); do + curl --location --fail --output "${GRADLE_WRAPPER_JAR}" https://raw.githubusercontent.com/gradle/gradle/v${GRADLE_VERSION}/gradle/wrapper/gradle-wrapper.jar || { + echo "Download attempt ${_retry}/${MAX_RETRIES} failed (curl error)" > /dev/stderr + rm -f "${GRADLE_WRAPPER_JAR}" + if [ "${_retry}" -lt "${MAX_RETRIES}" ]; then sleep 5; continue; fi + exit 1 + } + JAR_CHECKSUM="$(${SHASUM} "${GRADLE_WRAPPER_JAR}" | cut -d\ -f1)" + if [ "${JAR_CHECKSUM}" = "${EXPECTED}" ]; then + break + fi + echo "SHA256 mismatch on attempt ${_retry}/${MAX_RETRIES} (got ${JAR_CHECKSUM}, expected ${EXPECTED})" > /dev/stderr + rm -f "${GRADLE_WRAPPER_JAR}" + if [ "${_retry}" -lt "${MAX_RETRIES}" ]; then sleep 5; continue; fi echo "Expected sha256 of the downloaded gradle-wrapper.jar does not match the downloaded sha256!" > /dev/stderr exit 1 - fi + done fi