Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
e5bf595
fix ci
MisterRaindrop Mar 26, 2026
8acabec
fix
MisterRaindrop Mar 27, 2026
01d5b26
fix
MisterRaindrop Mar 27, 2026
52e04d2
fix
MisterRaindrop Mar 27, 2026
d53ca88
fix
MisterRaindrop Mar 27, 2026
a719511
fix: resolve Rocky 9 Parquet test failures and improve service stability
MisterRaindrop Mar 30, 2026
eb69809
fix: use correct HBase RegionServer port 60020 in wait_for_hbase
MisterRaindrop Mar 30, 2026
9c2c409
fix: simplify wait_for_hbase - remove broken /dev/tcp port check
MisterRaindrop Mar 30, 2026
a81a492
fix: preemptive port cleanup with fuser + fix pipefail bug in DataNod…
MisterRaindrop Mar 31, 2026
60c28e3
feat: add TestNG retry analyzer for transient CI test failures
MisterRaindrop Mar 31, 2026
9d19168
fix: use TestNG 6.x API getRetryAnalyzer() instead of 7.x getRetryAna…
MisterRaindrop Mar 31, 2026
3ec0a0b
fix: remove type assignment - TestNG 6.x getRetryAnalyzer() returns I…
MisterRaindrop Mar 31, 2026
fc74036
fix: register RetryListener via surefire config instead of @Listeners
MisterRaindrop Mar 31, 2026
951daee
fix: install psmisc package to provide fuser for DataNode port cleanup
MisterRaindrop Apr 1, 2026
6a7cddb
feat: enhance RetryAnalyzer with 3 retries and exponential backoff (3…
MisterRaindrop Apr 1, 2026
5c458fc
feat: cache singlecluster Docker image and use Apache official CDN
MisterRaindrop Apr 1, 2026
6d4be16
fix: persist TZ=UTC and PXF_JVM_OPTS into pxf-env.sh for pxf restart
MisterRaindrop Apr 2, 2026
4cc87f7
feat: pre-build test-ready Docker images to speed up CI
MisterRaindrop Apr 2, 2026
44d342b
fix: mkdir -p cloudberry-source before tar extract in build-test-image
MisterRaindrop Apr 2, 2026
8f0c8ef
fix: merge apt/dnf RUN layers to avoid cache invalidation in Dockerfi…
MisterRaindrop Apr 2, 2026
df0d447
fix: add USER root to Dockerfile.test-ready for apt/dnf permissions
MisterRaindrop Apr 2, 2026
4c43a72
fix: strip cloudberry/ prefix when extracting source tar for Dockerfi…
MisterRaindrop Apr 2, 2026
daa01c4
fix: move demo cluster creation to runtime (Docker build hostname mis…
MisterRaindrop Apr 2, 2026
fdb4d1b
fix: rescan SSH host keys after sshd start for Ganymed SSH authentica…
MisterRaindrop Apr 2, 2026
07a59a2
fix: fully re-configure SSH at runtime (password, crypto-policy, sshd…
MisterRaindrop Apr 2, 2026
8babe36
fix: use full setup_ssh() from original entrypoint instead of pre-bak…
MisterRaindrop Apr 2, 2026
44f4f2e
fix: skip retry for multi-block write tests that accumulate data with…
MisterRaindrop Apr 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 131 additions & 60 deletions .github/workflows/pxf-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,15 @@ jobs:
with:
path: cloudberry-pxf

# Restore a previously built singlecluster image tarball, keyed on the
# contents of the image's build context.
- name: Cache singlecluster image
  id: cache-image
  uses: actions/cache@v4
  with:
    path: /tmp/singlecluster-image.tar
    # hashFiles() resolves patterns relative to GITHUB_WORKSPACE, and the
    # repository is checked out under cloudberry-pxf/. Without that prefix
    # the pattern matches nothing, hashFiles() returns an empty string, and
    # the key never changes when the build context changes.
    key: singlecluster-ubuntu-${{ hashFiles('cloudberry-pxf/ci/singlecluster/**') }}

- name: Build singlecluster image
if: steps.cache-image.outputs.cache-hit != 'true'
run: |
cd cloudberry-pxf/ci/singlecluster
docker build -t pxf/singlecluster:3 .
Expand Down Expand Up @@ -185,7 +193,15 @@ jobs:
with:
path: cloudberry-pxf

# Restore a previously built Rocky 9 singlecluster image tarball, keyed on
# the contents of the image's build context.
- name: Cache singlecluster Rocky 9 image
  id: cache-image-rocky9
  uses: actions/cache@v4
  with:
    path: /tmp/singlecluster-rocky9-image.tar
    # hashFiles() resolves patterns relative to GITHUB_WORKSPACE, and the
    # repository is checked out under cloudberry-pxf/. Without that prefix
    # the pattern matches nothing, hashFiles() returns an empty string, and
    # the key never changes when the build context changes.
    key: singlecluster-rocky9-${{ hashFiles('cloudberry-pxf/ci/singlecluster/**') }}

- name: Build singlecluster Rocky 9 image
if: steps.cache-image-rocky9.outputs.cache-hit != 'true'
run: |
cd cloudberry-pxf/ci/singlecluster
docker build --build-arg BASE_IMAGE=apache/incubator-cloudberry:cbdb-build-rocky9-latest -t pxf/singlecluster-rocky9:3 .
Expand All @@ -198,10 +214,109 @@ jobs:
path: /tmp/singlecluster-rocky9-image.tar
retention-days: 1

# Stage 1.5: Build test-ready images (Cloudberry + packages pre-installed)
build-test-image:
name: Build Test-Ready Image (Ubuntu)
needs: [build-cloudberry-deb, build-docker-images]
runs-on: ubuntu-latest
steps:
- name: Checkout PXF source
uses: actions/checkout@v4
with:
path: cloudberry-pxf

- name: Download singlecluster image
uses: actions/download-artifact@v4
with:
name: singlecluster-image
path: /tmp

- name: Download Cloudberry DEB
uses: actions/download-artifact@v4
with:
name: cloudberry-deb
path: cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu/cloudberry-package/

- name: Download Cloudberry source
uses: actions/download-artifact@v4
with:
name: cloudberry-source
path: /tmp

- name: Prepare build context
run: |
docker load < /tmp/singlecluster-image.tar
mkdir -p cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu/cloudberry-source
tar xzf /tmp/cloudberry-source.tar.gz --strip-components=1 -C cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu/cloudberry-source/
ls cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu/cloudberry-package/
ls cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu/cloudberry-source/Makefile

- name: Build test-ready image
run: |
cd cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu
docker build -f Dockerfile.test-ready -t pxf/test-ready-ubuntu:latest .
docker save pxf/test-ready-ubuntu:latest > /tmp/test-ready-ubuntu.tar

- name: Upload test-ready image
uses: actions/upload-artifact@v4
with:
name: test-ready-ubuntu-image
path: /tmp/test-ready-ubuntu.tar
retention-days: 1

build-test-image-rocky9:
name: Build Test-Ready Image (Rocky 9)
needs: [build-cloudberry-rpm, build-docker-images-rocky9]
runs-on: ubuntu-latest
steps:
- name: Checkout PXF source
uses: actions/checkout@v4
with:
path: cloudberry-pxf

- name: Download singlecluster Rocky 9 image
uses: actions/download-artifact@v4
with:
name: singlecluster-rocky9-image
path: /tmp

- name: Download Cloudberry RPM
uses: actions/download-artifact@v4
with:
name: cloudberry-rpm
path: cloudberry-pxf/ci/docker/pxf-cbdb-dev/rocky9/cloudberry-package/

- name: Download Cloudberry source (Rocky 9)
uses: actions/download-artifact@v4
with:
name: cloudberry-source-rocky9
path: /tmp

- name: Prepare build context
run: |
docker load < /tmp/singlecluster-rocky9-image.tar
mkdir -p cloudberry-pxf/ci/docker/pxf-cbdb-dev/rocky9/cloudberry-source
tar xzf /tmp/cloudberry-source-rocky9.tar.gz --strip-components=1 -C cloudberry-pxf/ci/docker/pxf-cbdb-dev/rocky9/cloudberry-source/
ls cloudberry-pxf/ci/docker/pxf-cbdb-dev/rocky9/cloudberry-package/
ls cloudberry-pxf/ci/docker/pxf-cbdb-dev/rocky9/cloudberry-source/Makefile

- name: Build test-ready image
run: |
cd cloudberry-pxf/ci/docker/pxf-cbdb-dev/rocky9
docker build -f Dockerfile.test-ready -t pxf/test-ready-rocky9:latest .
docker save pxf/test-ready-rocky9:latest > /tmp/test-ready-rocky9.tar

- name: Upload test-ready image
uses: actions/upload-artifact@v4
with:
name: test-ready-rocky9-image
path: /tmp/test-ready-rocky9.tar
retention-days: 1

# Stage 2: Parallel test jobs using matrix strategy
pxf-test:
name: Test PXF - ${{ matrix.test_group }}
needs: [build-cloudberry-deb, build-docker-images]
needs: [build-test-image]
runs-on: ubuntu-latest
strategy:
fail-fast: false
Expand Down Expand Up @@ -245,44 +360,22 @@ jobs:
path: cloudberry-pxf
submodules: true

- name: Download Cloudberry DEB
uses: actions/download-artifact@v4
with:
name: cloudberry-deb
path: /tmp

- name: Download Cloudberry source
uses: actions/download-artifact@v4
with:
name: cloudberry-source
path: /tmp

- name: Download singlecluster image
- name: Download test-ready image
uses: actions/download-artifact@v4
with:
name: singlecluster-image
name: test-ready-ubuntu-image
path: /tmp

- name: Load singlecluster image
run: |
docker load < /tmp/singlecluster-image.tar

- name: Prepare Cloudberry source
run: |
tar xzf /tmp/cloudberry-source.tar.gz
chmod -R u+rwX,go+rX cloudberry
- name: Load test-ready image
run: docker load < /tmp/test-ready-ubuntu.tar

- name: Start Services
id: start_services
run: |
cd cloudberry-pxf
docker compose -f ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml down -v || true
docker compose -f ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml build
docker compose -f ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml up -d
docker exec pxf-cbdb-dev sudo chown -R gpadmin:gpadmin /home/gpadmin/workspace/cloudberry
docker cp /tmp/*.deb pxf-cbdb-dev:/tmp/
docker exec pxf-cbdb-dev sudo chown gpadmin:gpadmin /tmp/*.deb
docker exec pxf-cbdb-dev bash -lc "cd /home/gpadmin/workspace/cloudberry-pxf && ./ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh"
docker exec pxf-cbdb-dev bash -lc "cd /home/gpadmin/workspace/cloudberry-pxf && ./ci/docker/pxf-cbdb-dev/common/script/entrypoint_fast.sh"

- name: Run Test - ${{ matrix.test_group }}
id: run_test
Expand Down Expand Up @@ -364,15 +457,15 @@ jobs:
FAILED_COUNT="${{ steps.collect_artifacts.outputs.failed_count || 0 }}"
SKIPPED_COUNT="${{ steps.collect_artifacts.outputs.skipped_count || 0 }}"

if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} failed (Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "${{ steps.run_test.outcome }}" == "skipped" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} failed (outcome: ${{ steps.run_test.outcome }}, Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
exit 1
fi

# Stage 2b: Rocky 9 parallel test jobs
pxf-test-rocky9:
name: Test PXF Rocky9 - ${{ matrix.test_group }}
needs: [build-cloudberry-rpm, build-docker-images-rocky9]
needs: [build-test-image-rocky9]
runs-on: ubuntu-latest
strategy:
fail-fast: false
Expand Down Expand Up @@ -417,44 +510,22 @@ jobs:
path: cloudberry-pxf
submodules: true

- name: Download Cloudberry RPM
uses: actions/download-artifact@v4
with:
name: cloudberry-rpm
path: /tmp

- name: Download Cloudberry source (Rocky 9)
uses: actions/download-artifact@v4
with:
name: cloudberry-source-rocky9
path: /tmp

- name: Download singlecluster Rocky 9 image
- name: Download test-ready Rocky 9 image
uses: actions/download-artifact@v4
with:
name: singlecluster-rocky9-image
name: test-ready-rocky9-image
path: /tmp

- name: Load singlecluster Rocky 9 image
run: |
docker load < /tmp/singlecluster-rocky9-image.tar

- name: Prepare Cloudberry source
run: |
tar xzf /tmp/cloudberry-source-rocky9.tar.gz
chmod -R u+rwX,go+rX cloudberry
- name: Load test-ready Rocky 9 image
run: docker load < /tmp/test-ready-rocky9.tar

- name: Start Services
id: start_services
run: |
cd cloudberry-pxf
docker compose -f ci/docker/pxf-cbdb-dev/rocky9/docker-compose.yml down -v || true
docker compose -f ci/docker/pxf-cbdb-dev/rocky9/docker-compose.yml build
docker compose -f ci/docker/pxf-cbdb-dev/rocky9/docker-compose.yml up -d
docker exec pxf-cbdb-dev sudo chown -R gpadmin:gpadmin /home/gpadmin/workspace/cloudberry
docker cp /tmp/*.rpm pxf-cbdb-dev:/tmp/
docker exec pxf-cbdb-dev sudo chown gpadmin:gpadmin /tmp/*.rpm
docker exec pxf-cbdb-dev bash -lc "cd /home/gpadmin/workspace/cloudberry-pxf && ./ci/docker/pxf-cbdb-dev/common/script/entrypoint.sh"
docker exec pxf-cbdb-dev bash -lc "cd /home/gpadmin/workspace/cloudberry-pxf && ./ci/docker/pxf-cbdb-dev/common/script/entrypoint_fast.sh"

- name: Run Test - ${{ matrix.test_group }}
id: run_test
Expand Down Expand Up @@ -536,8 +607,8 @@ jobs:
FAILED_COUNT="${{ steps.collect_artifacts.outputs.failed_count || 0 }}"
SKIPPED_COUNT="${{ steps.collect_artifacts.outputs.skipped_count || 0 }}"

if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} (Rocky 9) failed (Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
if [ "${{ steps.run_test.outcome }}" == "failure" ] || [ "${{ steps.run_test.outcome }}" == "skipped" ] || [ "$FAILED_COUNT" -gt 0 ]; then
echo "Test group ${{ matrix.test_group }} (Rocky 9) failed (outcome: ${{ steps.run_test.outcome }}, Failures: $FAILED_COUNT, Skipped: $SKIPPED_COUNT)"
exit 1
fi

Expand Down
6 changes: 6 additions & 0 deletions automation/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@
<argLine>-Xmx4096m</argLine>
<forkCount>1</forkCount>
<reuseForks>false</reuseForks>
<properties>
<property>
<name>listener</name>
<value>listeners.RetryListener</value>
</property>
</properties>
</configuration>
<executions>
<execution>
Expand Down
65 changes: 65 additions & 0 deletions automation/src/main/java/listeners/RetryAnalyzer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package listeners;

import org.testng.IRetryAnalyzer;
import org.testng.ITestResult;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;

/**
 * Retries failed tests up to {@value #MAX_RETRIES} times with exponential
 * backoff to handle transient CI failures (e.g. HDFS timeouts on
 * resource-constrained GitHub Actions runners).
 *
 * <p>Delay schedule: 3-8s, 6-16s, 12-32s (each bound capped at 60s). The
 * actual delay is drawn uniformly at random from the range for the current
 * attempt.
 *
 * <p>Tests that write data to HDFS without cleanup are excluded from retry
 * because retrying would append duplicate data and cause row-count mismatches.
 *
 * <p>The retry counter is an instance field, so the retry budget is tracked
 * per analyzer instance. If the backoff sleep is interrupted, the interrupt
 * flag is restored and no retry is requested.
 */
public class RetryAnalyzer implements IRetryAnalyzer {

    private static final int MAX_RETRIES = 3;
    private static final int BASE_MIN_MS = 3000;
    private static final int BASE_MAX_MS = 8000;
    private static final int MAX_DELAY_MS = 60000;

    /** Tests that accumulate data on retry — skip retrying these. */
    private static final Set<String> NO_RETRY_TESTS = new HashSet<>(Arrays.asList(
            "copyFromFileMultiBlockedDataNoCompression",
            "copyFromFileMultiBlockedDataGZip",
            "copyFromFileMultiBlockedDataBZip2"
    ));

    /** Number of retries this analyzer instance has already granted. */
    private int retryCount = 0;
    private final Random random = new Random();

    /**
     * Decides whether the failed test should be run again.
     *
     * <p>When a retry is granted this method blocks for the backoff delay
     * before returning.
     *
     * @param result the result of the test invocation that just failed
     * @return {@code true} to re-run the test; {@code false} if the test is
     *         on the no-retry list, the retry budget is exhausted, or the
     *         backoff sleep was interrupted
     */
    @Override
    public boolean retry(ITestResult result) {
        String methodName = result.getMethod().getMethodName();
        if (NO_RETRY_TESTS.contains(methodName)) {
            System.out.println("[RetryAnalyzer] Skipping retry for " + methodName
                    + " (write-without-cleanup test)");
            return false;
        }
        if (retryCount >= MAX_RETRIES) {
            return false;
        }

        retryCount++;
        int delay = nextDelayMs(retryCount);
        System.out.println("[RetryAnalyzer] Retrying failed test: "
                + result.getTestClass().getName() + "." + methodName
                + " after " + delay + "ms delay"
                + " (attempt " + (retryCount + 1) + "/" + (MAX_RETRIES + 1) + ")");
        try {
            Thread.sleep(delay);
        } catch (InterruptedException e) {
            // Restore the flag for callers further up the stack and stop
            // retrying: re-running a test on an interrupted thread would make
            // its blocking calls fail immediately and burn the retry budget.
            Thread.currentThread().interrupt();
            System.out.println("[RetryAnalyzer] Interrupted during backoff; not retrying "
                    + methodName);
            return false;
        }
        return true;
    }

    /**
     * Picks a random backoff delay for the given retry number.
     *
     * @param attempt 1-based retry number; the base range doubles with each
     *                retry (multiplier 1, 2, 4, ...)
     * @return a delay in milliseconds within the range for this retry, with
     *         both ends of the range capped at {@code MAX_DELAY_MS}
     */
    private int nextDelayMs(int attempt) {
        int multiplier = 1 << (attempt - 1); // 1, 2, 4
        int minDelay = Math.min(BASE_MIN_MS * multiplier, MAX_DELAY_MS);
        int maxDelay = Math.min(BASE_MAX_MS * multiplier, MAX_DELAY_MS);
        // +1 makes the upper bound inclusive; when both ends hit the cap the
        // range collapses to a single value and nextInt(1) yields 0.
        return minDelay + random.nextInt(maxDelay - minDelay + 1);
    }
}
26 changes: 26 additions & 0 deletions automation/src/main/java/listeners/RetryListener.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package listeners;

import org.testng.IAnnotationTransformer;
import org.testng.annotations.ITestAnnotation;

import java.lang.reflect.Constructor;
import java.lang.reflect.Method;

/**
 * Annotation transformer that attaches {@link RetryAnalyzer} to every
 * test method that does not already have a retry analyzer configured.
 * <p>
 * This listener is registered through the surefire {@code listener}
 * property in {@code automation/pom.xml}. It must not be registered with
 * {@code @Listeners}: TestNG does not honour {@code IAnnotationTransformer}
 * implementations declared there, because annotations have to be
 * transformed before the test classes are processed.
 */
public class RetryListener implements IAnnotationTransformer {

    /**
     * Sets {@link RetryAnalyzer} as the retry analyzer on the given test
     * annotation unless one is already configured.
     *
     * @param annotation      the test annotation being processed
     * @param testClass       unused by this transformer
     * @param testConstructor unused by this transformer
     * @param testMethod      unused by this transformer
     */
    @Override
    @SuppressWarnings("rawtypes") // raw Class/Constructor mirror the overridden interface method
    public void transform(ITestAnnotation annotation, Class testClass,
                          Constructor testConstructor, Method testMethod) {
        // TestNG 6.x: getRetryAnalyzer() returns IRetryAnalyzer instance (null if unset)
        if (annotation.getRetryAnalyzer() == null) {
            annotation.setRetryAnalyzer(RetryAnalyzer.class);
        }
    }
}
Loading
Loading