diff --git a/.github/workflows/dependency-submission.yml b/.github/workflows/dependency-submission.yml index e3ad74f9a..1c0858f93 100644 --- a/.github/workflows/dependency-submission.yml +++ b/.github/workflows/dependency-submission.yml @@ -1,3 +1,24 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# Apache Cloudberry PXF Dependency Submission Workflow +# -------------------------------------------------------------------- name: Dependency Submission on: diff --git a/.github/workflows/pxf-ci.yml b/.github/workflows/pxf-ci.yml index 83188f352..3666ab8e2 100644 --- a/.github/workflows/pxf-ci.yml +++ b/.github/workflows/pxf-ci.yml @@ -1,3 +1,24 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. 
The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. +# +# -------------------------------------------------------------------- +# Apache Cloudberry PXF CI Workflow +# -------------------------------------------------------------------- name: PXF CI Pipeline on: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6af259c58..02a2c06cc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,3 +1,22 @@ + + Apache Cloudberry community welcomes contributions from anyone, new and experienced! We appreciate your interest in contributing. This guide will help you get started with the contribution. diff --git a/DISCLAIMER b/DISCLAIMER new file mode 100644 index 000000000..144253767 --- /dev/null +++ b/DISCLAIMER @@ -0,0 +1,9 @@ +Apache Cloudberry is an effort undergoing incubation at The Apache +Software Foundation (ASF), sponsored by the Apache +Incubator. Incubation is required of all newly accepted projects until +a further review indicates that the infrastructure, communications, +and decision making process have stabilized in a manner consistent +with other successful ASF projects. While incubation status is not +necessarily a reflection of the completeness or stability of the code, +it does indicate that the project has yet to be fully endorsed by the +ASF. 
\ No newline at end of file diff --git a/LICENSE b/LICENSE index 4bd5496af..3c1f33f21 100644 --- a/LICENSE +++ b/LICENSE @@ -214,6 +214,84 @@ This product is derived from software originally developed by: notices and license terms. Your use of these subcomponents is subject to the terms and conditions of the subcomponent's license, as noted in the LICENSE file. + +The Greenplum Platform Extension Framework includes: + +---------------------------- + Apache License - Version 2.0 + +The following files are licensed under the Apache License, Version 2.0: + +FDW Module: + fdw/libchurl.c + fdw/libchurl.h + fdw/pxf_bridge.c + fdw/pxf_bridge.h + fdw/pxf_filter.c + fdw/pxf_filter.h + fdw/pxf_header.c + fdw/pxf_header.h + +External Table Module: + external-table/src/gpdbwritableformatter.c + external-table/src/libchurl.c + external-table/src/libchurl.h + external-table/src/pxfbridge.c + external-table/src/pxfbridge.h + external-table/src/pxffilters.c + external-table/src/pxffilters.h + external-table/src/pxfheaders.c + external-table/src/pxfheaders.h + external-table/src/pxfprotocol.c + external-table/src/pxfuriparser.c + external-table/src/pxfuriparser.h + external-table/test/pxffilters_test.c + external-table/test/pxfheaders_test.c + external-table/test/pxfprotocol_test.c + external-table/test/pxfuriparser_test.c + +Server Module (Java Sources): + server/build.gradle + server/gradle.properties + server/settings.gradle + server/pxf-api/src/main/java/org/apache/cloudberry/pxf/api/*.java + server/pxf-api/src/test/java/org/apache/cloudberry/pxf/api/**/*.java + server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/**/*.java + server/pxf-hbase/src/test/java/org/apache/cloudberry/pxf/plugins/hbase/**/*.java + server/pxf-hdfs/src/main/java/org/apache/cloudberry/pxf/plugins/hdfs/**/*.java + server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/**/*.java + server/pxf-hive/src/main/java/org/apache/cloudberry/pxf/plugins/hive/**/*.java + 
server/pxf-hive/src/test/java/org/apache/cloudberry/pxf/plugins/hive/**/*.java + server/pxf-jdbc/src/main/java/org/apache/cloudberry/pxf/plugins/jdbc/**/*.java + server/pxf-jdbc/src/test/java/org/apache/cloudberry/pxf/plugins/jdbc/**/*.java + server/pxf-json/src/main/java/org/apache/cloudberry/pxf/plugins/json/**/*.java + server/pxf-json/src/test/java/org/apache/cloudberry/pxf/plugins/json/**/*.java + server/pxf-service/src/main/java/org/apache/cloudberry/pxf/service/**/*.java + server/pxf-service/src/test/java/org/apache/cloudberry/pxf/service/**/*.java + +Documentation Templates: + docs/content/*.html.md.erb + +Configuration Files: + server/pxf-api/src/test/resources/pxf-profiles-default.xml + server/pxf-hive/src/test/resources/pxf-profiles-default.xml + server/pxf-jdbc/src/test/resources/log4j.properties + server/pxf-json/src/test/resources/log4j.properties + server/pxf-service/src/main/resources/pxf-profiles-default.xml + server/pxf-service/src/templates/conf/pxf-profiles.xml + server/pxf-service/src/test/resources/pxf-profiles-default.xml + +CI/Test Templates: + automation/src/test/resources/templates/zk/zoo.cfg + ci/singlecluster/templates/hadoop/etc/hadoop/core-site.xml + ci/singlecluster/templates/hadoop/etc/hadoop/hdfs-site.xml + ci/singlecluster/templates/hadoop/etc/hadoop/yarn-env.sh + ci/singlecluster/templates/hbase/conf/hbase-env.sh + ci/singlecluster/templates/hbase/conf/hbase-site.xml + ci/singlecluster/templates/ranger/install.properties + ci/singlecluster/templates/tez/conf/tez-site.xml + ci/singlecluster/templates/usersync/install.properties + ======================================================================= This product bundles Gradle Wrapper, which is licensed under diff --git a/Makefile b/Makefile index caaac20ed..826ed8d64 100644 --- a/Makefile +++ b/Makefile @@ -84,6 +84,9 @@ install-server: stage: rm -rf build/stage + make -C $(SOURCE_EXTENSION_DIR) stage + make -C cli stage + make -C server stage ifneq 
($(SKIP_EXTERNAL_TABLE_PACKAGE_REASON),) @echo "Skipping staging FDW extension because $(SKIP_EXTERNAL_TABLE_PACKAGE_REASON)" $(eval PXF_MODULES := $(filter-out external-table,$(PXF_MODULES))) @@ -95,12 +98,12 @@ endif set -e ;\ GP_MAJOR_VERSION=$$(cat $(SOURCE_EXTENSION_DIR)/build/metadata/gp_major_version) ;\ GP_BUILD_ARCH=$$(cat $(SOURCE_EXTENSION_DIR)/build/metadata/build_arch) ;\ - PXF_PACKAGE_NAME=pxf-cbdb$${GP_MAJOR_VERSION}-$${PXF_VERSION}-$${GP_BUILD_ARCH} ;\ + PXF_PACKAGE_NAME=pxf-cloudberry$${GP_MAJOR_VERSION}-$${PXF_VERSION}-$${GP_BUILD_ARCH} ;\ mkdir -p build/stage/$${PXF_PACKAGE_NAME} ;\ cp -a $(SOURCE_EXTENSION_DIR)/build/stage/* build/stage/$${PXF_PACKAGE_NAME} ;\ cp -a cli/build/stage/* build/stage/$${PXF_PACKAGE_NAME} ;\ cp -a server/build/stage/* build/stage/$${PXF_PACKAGE_NAME} ;\ - echo $$(git rev-parse --verify HEAD) > build/stage/$${PXF_PACKAGE_NAME}/pxf/commit.sha ;\ + echo $$(git rev-parse --verify HEAD) > build/stage/$${PXF_PACKAGE_NAME}/commit.sha ;\ cp package/install_binary build/stage/$${PXF_PACKAGE_NAME}/install_component ;\ echo "===> PXF staging is complete <===" @@ -115,16 +118,15 @@ gppkg-rpm: rpm mkdir -p gppkg/deps GP_MAJOR_VERSION=$$(cat $(SOURCE_EXTENSION_DIR)/build/metadata/gp_major_version) cat package/gppkg_spec.yml.in | sed "s,#arch,`arch`," | sed "s,#os,$(TEST_OS)," | sed "s,#gppkgver,1.0," | sed "s,#gpver,1," > gppkg/gppkg_spec.yml - find build/rpmbuild/RPMS -name pxf-cbdb$(GP_MAJOR_VERSION)-*.rpm -exec cp {} gppkg/ \; - source $(GPHOME)/greenplum_path.sh && gppkg --build gppkg + find build/rpmbuild/RPMS -name pxf-cloudberry$(GP_MAJOR_VERSION)-*.rpm -exec cp {} gppkg/ \; + source $(GPHOME)/greenplum_path.sh || source $(GPHOME)/cloudberry-env.sh && gppkg --build gppkg -rpm: - make -C $(SOURCE_EXTENSION_DIR) stage - make -C cli stage - make -C server stage +rpm: stage set -e ;\ GP_MAJOR_VERSION=$$(cat $(SOURCE_EXTENSION_DIR)/build/metadata/gp_major_version) ;\ - PXF_FULL_VERSION=$${PXF_VERSION} ;\ + GP_BUILD_ARCH=$$(cat 
$(SOURCE_EXTENSION_DIR)/build/metadata/build_arch) ;\ + PXF_PACKAGE_NAME=pxf-cloudberry$${GP_MAJOR_VERSION}-${PXF_VERSION}-$${GP_BUILD_ARCH} ;\ + PXF_FULL_VERSION=${PXF_VERSION} ;\ PXF_MAIN_VERSION=$$(echo $${PXF_FULL_VERSION} | sed -E 's/(-SNAPSHOT|-rc[0-9]+)$$//') ;\ if [[ $${PXF_FULL_VERSION} == *"-SNAPSHOT" ]]; then \ PXF_RELEASE=SNAPSHOT; \ @@ -135,7 +137,7 @@ rpm: fi ;\ rm -rf build/rpmbuild ;\ mkdir -p build/rpmbuild/{BUILD,RPMS,SOURCES,SPECS} ;\ - cp -a build/stage/$${PXF_PACKAGE_NAME}/pxf/* build/rpmbuild/SOURCES ;\ + cp -a build/stage/$${PXF_PACKAGE_NAME}/* build/rpmbuild/SOURCES ;\ cp package/*.spec build/rpmbuild/SPECS/ ;\ rpmbuild \ --define "_topdir $${PWD}/build/rpmbuild" \ @@ -150,7 +152,7 @@ rpm-tar: rpm mkdir -p build/{stagerpm,distrpm} set -e ;\ GP_MAJOR_VERSION=$$(cat $(SOURCE_EXTENSION_DIR)/build/metadata/gp_major_version) ;\ - PXF_RPM_FILE=$$(find build/rpmbuild/RPMS -name pxf-cbdb$${GP_MAJOR_VERSION}-*.rpm) ;\ + PXF_RPM_FILE=$$(find build/rpmbuild/RPMS -name apache-cloudberry-pxf-incubating-*.rpm) ;\ PXF_RPM_BASE_NAME=$$(basename $${PXF_RPM_FILE%*.rpm}) ;\ PXF_PACKAGE_NAME=$${PXF_RPM_BASE_NAME%.*} ;\ mkdir -p build/stagerpm/$${PXF_PACKAGE_NAME} ;\ @@ -165,24 +167,24 @@ deb: stage PXF_MAIN_VERSION=$${PXF_VERSION//-SNAPSHOT/} ;\ if [[ $${PXF_VERSION} == *"-SNAPSHOT" ]]; then PXF_RELEASE=SNAPSHOT; else PXF_RELEASE=1; fi ;\ rm -rf build/debbuild ;\ - mkdir -p build/debbuild/usr/local/pxf-cbdb$${GP_MAJOR_VERSION}/$(TARGET_EXTENSION_DIR) ;\ - cp -a $(SOURCE_EXTENSION_DIR)/build/stage/* build/debbuild/usr/local/pxf-cbdb$${GP_MAJOR_VERSION}/$(TARGET_EXTENSION_DIR) ;\ - cp -a cli/build/stage/pxf/* build/debbuild/usr/local/pxf-cbdb$${GP_MAJOR_VERSION} ;\ - cp -a server/build/stage/pxf/* build/debbuild/usr/local/pxf-cbdb$${GP_MAJOR_VERSION} ;\ - echo $$(git rev-parse --verify HEAD) > build/debbuild/usr/local/pxf-cbdb$${GP_MAJOR_VERSION}/commit.sha ;\ + mkdir -p build/debbuild/usr/local/cloudberry-pxf/$(TARGET_EXTENSION_DIR) ;\ + cp -a 
$(SOURCE_EXTENSION_DIR)/build/stage/* build/debbuild/usr/local/cloudberry-pxf/ ;\ + cp -a cli/build/stage/* build/debbuild/usr/local/cloudberry-pxf ;\ + cp -a server/build/stage/* build/debbuild/usr/local/cloudberry-pxf ;\ + echo $$(git rev-parse --verify HEAD) > build/debbuild/usr/local/cloudberry-pxf/commit.sha ;\ mkdir build/debbuild/DEBIAN ;\ cp -a package/DEBIAN/* build/debbuild/DEBIAN/ ;\ - sed -i -e "s/%VERSION%/$${PXF_MAIN_VERSION}-$${PXF_RELEASE}/" -e "s/%MAINTAINER%/${VENDOR}/" build/debbuild/DEBIAN/control ;\ + sed -i -e "s/%VERSION%/$${PXF_MAIN_VERSION}-$${PXF_RELEASE}/" -e "s/%MAINTAINER%/${VENDOR}/" -e "s/%ARCH%/$$(dpkg --print-architecture)/" build/debbuild/DEBIAN/control ;\ dpkg-deb --build build/debbuild ;\ - mv build/debbuild.deb build/pxf-cbdb$${GP_MAJOR_VERSION}-$${PXF_MAIN_VERSION}-$${PXF_RELEASE}-ubuntu18.04-amd64.deb + mv build/debbuild.deb build/apache-cloudberry-pxf-incubating-$${PXF_MAIN_VERSION}-$${PXF_RELEASE}-$$(lsb_release -si | tr '[:upper:]' '[:lower:]')$$(lsb_release -sr)-$$(dpkg --print-architecture).deb deb-tar: deb rm -rf build/{stagedeb,distdeb} mkdir -p build/{stagedeb,distdeb} set -e ;\ GP_MAJOR_VERSION=$$(cat $(SOURCE_EXTENSION_DIR)/build/metadata/gp_major_version) ;\ - PXF_DEB_FILE=$$(find build/ -name pxf-cbdb$${GP_MAJOR_VERSION}*.deb) ;\ - PXF_PACKAGE_NAME=$$(dpkg-deb --field $${PXF_DEB_FILE} Package)-$$(dpkg-deb --field $${PXF_DEB_FILE} Version)-ubuntu18.04 ;\ + PXF_DEB_FILE=$$(find build/ -name apache-cloudberry-pxf-incubating*.deb) ;\ + PXF_PACKAGE_NAME=$$(dpkg-deb --field $${PXF_DEB_FILE} Package)-$$(dpkg-deb --field $${PXF_DEB_FILE} Version)-$$(lsb_release -si | tr '[:upper:]' '[:lower:]')$$(lsb_release -rs) ;\ mkdir -p build/stagedeb/$${PXF_PACKAGE_NAME} ;\ cp $${PXF_DEB_FILE} build/stagedeb/$${PXF_PACKAGE_NAME} ;\ cp package/install_deb build/stagedeb/$${PXF_PACKAGE_NAME}/install_component ;\ diff --git a/README.md b/README.md index 26bbc1c4c..e580cd5a7 100755 --- a/README.md +++ b/README.md @@ -1,8 +1,12 @@ # 
Platform Extension Framework (PXF) for Apache Cloudberry (Incubating) -[![Slack](https://img.shields.io/badge/Join_Slack-6a32c9)](https://communityinviter.com/apps/cloudberrydb/welcome) -[![Twitter Follow](https://img.shields.io/twitter/follow/cloudberrydb)](https://twitter.com/cloudberrydb) -[![Website](https://img.shields.io/badge/Visit%20Website-eebc46)](https://cloudberry.apache.org) +[![Website](https://img.shields.io/badge/Website-eebc46)](https://cloudberry.apache.org) +[![Documentation](https://img.shields.io/badge/Documentation-acd94a)](https://cloudberry.apache.org/docs) +[![Slack](https://img.shields.io/badge/Join_Slack-6a32c9)](https://inviter.co/apache-cloudberry) +[![Twitter Follow](https://img.shields.io/twitter/follow/ASFCloudberry)](https://twitter.com/ASFCloudberry) +[![WeChat](https://img.shields.io/badge/WeChat-eebc46)](https://cloudberry.apache.org/community/wechat) +[![Youtube](https://img.shields.io/badge/Youtube-eebc46)](https://youtube.com/@ApacheCloudberry) +[![GitHub Discussions](https://img.shields.io/github/discussions/apache/cloudberry)](https://github.com/apache/cloudberry/discussions) --- @@ -12,7 +16,7 @@ PXF is an extensible framework that allows a distributed database like Greenplum PXF includes built-in connectors for accessing data that exists inside HDFS files, Hive tables, HBase tables, JDBC-accessible databases and more. Users can also create their own connectors to other data storage or processing engines. -This project is forked from [greenplum/pxf](https://github.com/greenplum-db/pxf-archive) and customized for Apache Cloudberry. +This project is derived from [greenplum/pxf](https://github.com/greenplum-db/pxf-archive) and customized for Apache Cloudberry. 
## Repository Contents @@ -23,20 +27,12 @@ This project is forked from [greenplum/pxf](https://github.com/greenplum-db/pxf- * `automation/` : Contains the automation and integration tests for PXF against the various datasources * `ci/` : Contains CI/CD environment and scripts (including singlecluster Hadoop environment) * `regression/` : Contains the end-to-end (integration) tests for PXF against the various datasources, utilizing the PostgreSQL testing framework `pg_regress` -* `downloads/` : An empty directory that serves as a staging location for Cloudberry RPMs for the development Docker image ## PXF Development Below are the steps to build and install PXF along with its dependencies including Cloudberry and Hadoop. -> [!Note] -> To start, ensure you have a `~/workspace` directory and have cloned the `pxf` and its prerequisites (shown below) under it. -(The name `workspace` is not strictly required but will be used throughout this guide.) - ```bash -mkdir -p ~/workspace -cd ~/workspace - git clone https://github.com/apache/cloudberry-pxf.git ``` @@ -49,22 +45,22 @@ To build PXF, you must have: Either download and install Cloudberry RPM or build Cloudberry from the source by following instructions in the [Cloudberry](https://github.com/apache/cloudberry). - Assuming you have installed Cloudberry into `/usr/local/cloudberrydb` directory, run its environment script: + Assuming you have installed Cloudberry into `/usr/local/cloudberry-db` directory, run its environment script: ``` - source /usr/local/cloudberrydb/greenplum_path.sh # For Cloudberry 2.0 - source /usr/local/cloudberrydb/cloudberry-env.sh # For Cloudberry 2.1+ + source /usr/local/cloudberry-db/greenplum_path.sh # For Cloudberry 2.0 + source /usr/local/cloudberry-db/cloudberry-env.sh # For Cloudberry 2.1+ ``` 3. JDK 1.8 or JDK 11 to compile/run Export your `JAVA_HOME`: ``` - export JAVA_HOME= + export JAVA_HOME=/usr/lib/jvm/java-11-openjdk ``` 4. 
Go (1.9 or later) - To install Go on CentOS, `sudo yum install go`. For other platforms, see the [Go downloads page](https://golang.org/dl/). + You can download and install Go via [Go downloads page](https://golang.org/dl/). Make sure to export your `GOPATH` and add go to your `PATH`. For example: ```shell @@ -78,46 +74,37 @@ To build PXF, you must have: go install github.com/onsi/ginkgo/ginkgo@latest ``` -5. cURL (7.29 or later): - - To install cURL devel package on CentOS 7, `sudo yum install libcurl-devel`. - - Note that CentOS 6 provides an older, unsupported version of cURL (7.19). You should install a newer version from source if you are on CentOS 6. - -### How to Build PXF +### Build PXF PXF uses Makefiles to build its components. PXF server component uses Gradle that is wrapped into the Makefile for convenience. ```bash -cd ~/workspace/pxf +cd cloudberry-pxf/ -# Compile & Test PXF +# Compile PXF make - -# Only run unit tests -make test ``` -### How to Install PXF +### Install PXF -To install PXF, first make sure that the user has sufficient permissions in the `$GPHOME` and `$PXF_HOME` directories to perform the installation. It's recommended to change ownership to match the installing user. For example, when installing PXF as user `gpadmin` under `/usr/local/cloudberrydb`: +To install PXF, first make sure that the user has sufficient permissions in the `$GPHOME` and `$PXF_HOME` directories to perform the installation. It's recommended to change ownership to match the installing user. 
For example, when installing PXF as user `gpadmin` under `/usr/local/cloudberry-db`: ```bash -export GPHOME=/usr/local/cloudberrydb -export PXF_HOME=/usr/local/pxf +mkdir -p /usr/local/cloudberry-pxf +export PXF_HOME=/usr/local/cloudberry-pxf export PXF_BASE=${HOME}/pxf-base -chown -R gpadmin:gpadmin "${GPHOME}" "${PXF_HOME}" -make -C ~/workspace/pxf install +chown -R gpadmin:gpadmin "${PXF_HOME}" +make install ``` NOTE: if `PXF_BASE` is not set, it will default to `PXF_HOME`, and server configurations, libraries or other configurations, might get deleted after a PXF re-install. -### How to Run PXF +### Run PXF -Ensure that PXF is in your path. This command can be added to your .bashrc +Ensure that PXF is in your path. This command can be added to your `.bashrc`: ```bash -export PATH=/usr/local/pxf/bin:$PATH +export PATH=/usr/local/cloudberry-pxf/bin:$PATH ``` Then you can prepare and start up PXF by doing the following. @@ -143,151 +130,13 @@ After PXF has been re-installed, you can restart the PXF instance using: pxf restart ``` -### How to demonstrate Hadoop Integration -In order to demonstrate end to end functionality you will need Hadoop installed. We have all the related hadoop components (hdfs, hive, hbase, zookeeper, etc) mapped into simple artifact named singlecluster. -You can [download from here](https://storage.googleapis.com/pxf-public/singlecluster-HDP.tar.gz) and untar the `singlecluster-HDP.tar.gz` file, which contains everything needed to run Hadoop. - -```bash -mv singlecluster-HDP.tar.gz ~/workspace/ -cd ~/workspace -tar xzf singlecluster-HDP.tar.gz -``` - -Create a symlink using `ln -s ~/workspace/singlecluster-HDP ~/workspace/singlecluster` and then follow the steps in [Setup Hadoop](####Setup-Hadoop). - -While PXF can run on either Java 8 or Java 11, please ensure that you are running Java 8 for hdfs, hadoop, etc. Please set your java version by seting your `JAVA_HOME` to the appropriate location. 
- -On a Mac, you can set your java version using `JAVA_HOME` like so: -``` -export JAVA_HOME=`/usr/libexec/java_home -v 1.8` -```` - -Initialize the default server configurations: -``` -cp ${PXF_HOME}/templates/*-site.xml ${PXF_BASE}/servers/default -``` - -### Development With Docker +## Development With Docker > [!Note] > Since the docker container will house all Single cluster Hadoop, Cloudberry and PXF, we recommend that you have at least 4 cpus and 6GB memory allocated to Docker. These settings are available under docker preferences. We provide a Docker-based development environment that includes Cloudberry, Hadoop, and PXF. See [automation/README.Docker.md](automation/README.Docker.md) for detailed instructions. -**Quick Start:** - -```bash -# Build and start the development container -docker compose -f ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml build -docker compose -f ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml up -d - -# Enter the container and run setup -docker exec -it pxf-cbdb-dev bash -c \ - "cd /home/gpadmin/workspace/cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu && ./script/entrypoint.sh" - -# Run tests -docker exec -it pxf-cbdb-dev bash -c \ - "cd /home/gpadmin/workspace/cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu && ./script/run_tests.sh" - -# Stop and clean up -docker compose -f ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml down -v -``` - -#### Setup Hadoop -Hdfs will be needed to demonstrate functionality. You can choose to start additional hadoop components (hive/hbase) if you need them. - -Setup [User Impersonation](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Superusers.html) prior to starting the hadoop components (this allows the `gpadmin` user to access hadoop data). - -The Docker development environment automatically configures Hadoop. For manual setup, see [automation/README.Docker.md](automation/README.Docker.md). 
- -Setup and start HDFS -```bash -pushd ~/workspace/singlecluster/bin -echo y | ./init-gphd.sh -./start-hdfs.sh -popd -``` - -Start other optional components based on your need - -```bash -pushd ~/workspace/singlecluster/bin -# Start Hive -./start-yarn.sh -./start-hive.sh - -# Start HBase -./start-zookeeper.sh -./start-hbase.sh -popd -``` - -#### Setup Minio (optional) -Minio is an S3-API compatible local storage solution. The development docker image comes with Minio software pre-installed. MinIO is automatically started by the Docker development environment. - -After the server starts, you can access Minio UI at `http://localhost:9000` from the host OS. Use `admin` for the access key and `password` for the secret key when connecting to your local Minio instance. - -To run S3 automation tests, set `PROTOCOL=minio`. If later you would like to run Hadoop HDFS tests, unset this variable with `unset PROTOCOL` command. - -#### Setup PXF - -Install PXF Server -```bash -# Install PXF -make -C ~/workspace/pxf install - -# Start PXF -export PXF_JVM_OPTS="-Xmx512m -Xms256m" -$PXF_HOME/bin/pxf start -``` - -Install PXF client (ignore if this is already done) -```bash -psql -d template1 -c "create extension pxf" -``` - -#### Run PXF Tests -All tests use a database named `pxfautomation`. -```bash -pushd ~/workspace/pxf/automation - -# Initialize default server configs using template -cp ${PXF_HOME}/templates/{hdfs,mapred,yarn,core,hbase,hive}-site.xml ${PXF_BASE}/servers/default - -# Run specific tests. Example: Hdfs Smoke Test -make TEST=HdfsSmokeTest - -# Run all tests. This will be very time consuming. 
-make GROUP=gpdb - -# If you wish to run test(s) against a different storage protocol set the following variable (for eg: s3) -export PROTOCOL=s3 -popd -``` - -If you see any HBase failures, try copying `pxf-hbase-*.jar` to the HBase classpath, and restart HBase: - -``` -cp ${PXF_HOME}/lib/pxf-hbase-*.jar ~/workspace/singlecluster/hbase/lib/pxf-hbase.jar -~/workspace/singlecluster/bin/stop-hbase.sh -~/workspace/singlecluster/bin/start-hbase.sh -``` - -#### Make Changes to PXF - -To deploy your changes to PXF in the development environment. - -```bash -# $PXF_HOME folder is replaced each time you make install. -# So, if you have any config changes, you may want to back those up. -$PXF_HOME/bin/pxf stop -make -C ~/workspace/pxf install -# Make any config changes you had backed up previously -rm -rf $PXF_HOME/pxf-service -yes | $PXF_HOME/bin/pxf init -$PXF_HOME/bin/pxf start -``` - ## IDE Setup (IntelliJ) - Start IntelliJ. Click "Open" and select the directory to which you cloned the `pxf` repo. @@ -311,47 +160,6 @@ no JDK set for Gradle. Just cancel and retry. It goes away the second time. - Debug the new configuration in IntelliJ - Run a query in CloudberryDB that uses PXF to debug with IntelliJ -## To run a Kerberized Hadoop Cluster - -### Requirements - -- Download bin_gpdb (from any of the pipelines) -- Download pxf_tarball (from any of the pipelines) - -These instructions allow you to run a Kerberized cluster. See [automation/README.Docker.md](automation/README.Docker.md) for detailed Kerberos setup instructions. 
- -```bash -docker run --rm -it \ - --privileged \ - --hostname c6401.ambari.apache.org \ - -p 5432:5432 \ - -p 5888:5888 \ - -p 8000:8000 \ - -p 8080:8080 \ - -p 8020:8020 \ - -p 9000:9000 \ - -p 9090:9090 \ - -p 50070:50070 \ - -w /home/gpadmin/workspace \ - -v ~/workspace/cbdb:/home/gpadmin/workspace/gpdb_src \ - -v ~/workspace/pxf:/home/gpadmin/workspace/pxf_src \ - -v ~/workspace/singlecluster-HDP:/home/gpadmin/workspace/singlecluster \ - -v ~/Downloads/bin_cbdb:/home/gpadmin/workspace/bin_cbdb \ - -v ~/Downloads/pxf_tarball:/home/gpadmin/workspace/pxf_tarball \ - -e CLUSTER_NAME=hdp \ - -e NODE=c6401.ambari.apache.org \ - -e REALM=AMBARI.APACHE.ORG \ - gcr.io/$PROJECT_ID/gpdb-pxf-dev/gpdb6-centos7-test-pxf-hdp2 /bin/bash - -# Inside the container, you can use the scripts in ci/docker/pxf-cbdb-dev/ubuntu/script to set up and run tests. - -echo "+----------------------------------------------+" -echo "| Kerberos admin principal: admin/admin@$REALM |" -echo "| Kerberos admin password : admin |" -echo "+----------------------------------------------+" - -su - gpadmin -``` ## Contribute diff --git a/automation/README.Docker.md b/automation/README.Docker.md index db0c3bc96..83fba9640 100644 --- a/automation/README.Docker.md +++ b/automation/README.Docker.md @@ -1,3 +1,22 @@ + + # Running Automation in Docker ## Prerequisites diff --git a/automation/README.Linux.md b/automation/README.Linux.md index 2d7e557bb..f4031c825 100644 --- a/automation/README.Linux.md +++ b/automation/README.Linux.md @@ -5,17 +5,18 @@ They are intended to be used in tandem with the information in the main README f ## Locale Setup -Automation creates a GPDB database using the `ru_RU.CP1251` locale. You can generate the required locale files with +Automation creates a Cloudberry database using the `ru_RU.CP1251` locale. 
You can generate the required locale files with ```sh sudo sed -i.bak -e 's/# ru_RU.CP1251.*/ru_RU.CP1251 CP1251/' /etc/locale.gen sudo locale-gen ``` -After generating the locale, restart your GPDB cluster +After generating the locale, restart your Cloudberry cluster ```sh -source $GPHOME/greenplum_path.sh +source $GPHOME/greenplum_path.sh # For Cloudberry 2.0 +source $GPHOME/cloudberry-env.sh # For Cloudberry 2.1+ gpstop -a gpstart -a ``` diff --git a/automation/README.md b/automation/README.md index c7236d026..8300d0795 100755 --- a/automation/README.md +++ b/automation/README.md @@ -130,7 +130,7 @@ Note: If you get an error saying that the jar does not exist, ensure that you ha - `src/main/java` - contains related classes and utilities for the test - `src/test/java` - contains the TestNG cases. - `sqlrepo` - contains SQL test cases. -- `src/main/java/org/greenplum/pxf/automation/components` - contains all the supported services/components with simple API abstractions. +- `src/main/java/org/apache/cloudberry/pxf/automation/components` - contains all the supported services/components with simple API abstractions. ### General Automation Architecture diff --git a/automation/pxf_regress/README.md b/automation/pxf_regress/README.md index 645cf7eab..078208069 100644 --- a/automation/pxf_regress/README.md +++ b/automation/pxf_regress/README.md @@ -4,7 +4,7 @@ `pxf_regress` is a PSQL test runner written in Go that is heavily inspired by `pg_regress`. PXF's automation test framework sets up data in external data -storage (e.g., Hadoop, Amazon S3, etc), creates Greenplum external tables to +storage (e.g., Hadoop, Amazon S3, etc), creates Cloudberry external tables to work with these data sets, and then invokes `pxf_regress` to run SQL test cases via `psql` and compare the results with expected output. 
Instead of matching the features of `pg_regress` exactly, this utility currently implements the @@ -36,16 +36,16 @@ small_data └── query02.sql ``` -There are no command line flags; the GPDB cluster that `pxf_regress` connects +There are no command line flags; the Cloudberry cluster that `pxf_regress` connects to can be customized with standard [Postgres environment variables][1]. ### Why not use `pg_regress`? -Ideally, PXF would re-use `pg_regress` which is included with upstream GPDB; -however, PXF supports multiple GPDB versions (currently 5, 6, & 7) with a -single code base. Differences between the GPDB major versions and the included +Ideally, PXF would re-use `pg_regress` which is included with upstream Cloudberry; +however, PXF supports multiple Cloudberry versions with a +single code base. Differences between the Cloudberry major versions and the included `pg_regress` results in non-semantically meaningful (for PXF) differences. -GPDB's version of `pg_regress` uses a utility called `gpdiff.pl` to compare +Cloudberry's version of `pg_regress` uses a utility called `gpdiff.pl` to compare actual test output with expected test output. From the description of [`gpdiff.pl`][2]: @@ -56,9 +56,9 @@ actual test output with expected test output. From the description of > single PostgreSQL instance. When `pg_regress` runs `gpdiff.pl`, it runs the version of `gpdiff.pl` that is -included with GPDB (`$($GPHOME/bin/pg_config +included with Cloudberry (`$($GPHOME/bin/pg_config --libdir)/postgresql/pgxs/src/test/regress/gpdiff.pl`) with hard-coded options -that cannot be customized. Not only is `gpdiff.pl` different across GPDB major +that cannot be customized. Not only is `gpdiff.pl` different across Cloudberry major versions, the set of options that `pg_regress` runs it with will be different across major versions. 
@@ -119,4 +119,4 @@ $ tree smoke/small_data ``` [1]: https://www.postgresql.org/docs/12/libpq-envars.html -[2]: https://github.com/greenplum-db/gpdb/blob/main/src/test/regress/gpdiff.pl +[2]: https://github.com/apache/cloudberry/blob/main/src/test/regress/gpdiff.pl diff --git a/automation/src/main/java/org/apache/cloudberry/pxf/automation/components/common/ShellSystemObject.java b/automation/src/main/java/org/apache/cloudberry/pxf/automation/components/common/ShellSystemObject.java index 3cdb1f156..e8dad0d46 100755 --- a/automation/src/main/java/org/apache/cloudberry/pxf/automation/components/common/ShellSystemObject.java +++ b/automation/src/main/java/org/apache/cloudberry/pxf/automation/components/common/ShellSystemObject.java @@ -57,7 +57,7 @@ public class ShellSystemObject extends BaseSystemObject { "GPHOME", "GPHD_ROOT", "GPDATA", - "MASTER_DATA_DIRECTORY", + "COORDINATOR_DATA_DIRECTORY", "PGPORT", "PGHOST", "PGDATABASE" diff --git a/automation/src/main/java/org/apache/cloudberry/pxf/automation/components/gpdb/Gpdb.java b/automation/src/main/java/org/apache/cloudberry/pxf/automation/components/gpdb/Gpdb.java index 201881f6b..ad04a97f6 100755 --- a/automation/src/main/java/org/apache/cloudberry/pxf/automation/components/gpdb/Gpdb.java +++ b/automation/src/main/java/org/apache/cloudberry/pxf/automation/components/gpdb/Gpdb.java @@ -22,7 +22,7 @@ public class Gpdb extends DbSystemObject { private static final String DEFAULT_PORT = "5432"; - private static final String GREENPLUM_DATABASE_PREFIX = "Greenplum Database "; + private static final String APACHE_CLOUDBERRY_PREFIX = "Apache Cloudberry "; private static final String IF_NOT_EXISTS_OPTION = "IF NOT EXISTS"; private String sshUserName; @@ -580,8 +580,8 @@ private int determineVersion() throws Exception { res.next(); String fullVersion = res.getString(1); ReportUtils.report(report, getClass(), "Retrieved from Greenplum: [" + fullVersion + "]"); - int gpIndex = fullVersion.indexOf(GREENPLUM_DATABASE_PREFIX); // 
where the version prefix starts - String prefix = GREENPLUM_DATABASE_PREFIX; + int gpIndex = fullVersion.indexOf(APACHE_CLOUDBERRY_PREFIX); // where the version prefix starts + String prefix = APACHE_CLOUDBERRY_PREFIX; // Cloudberry forks print strings like: // "PostgreSQL 14.4 (Apache Cloudberry 3.0.0-devel build dev) ..." // fall back to the Cloudberry prefix if the Greenplum one is missing diff --git a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java index 18218eac1..0bd9b611d 100644 --- a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java +++ b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java @@ -210,6 +210,11 @@ public void parquetWritePrimitivesZStd() throws Exception { runWritePrimitivesScenario("pxf_parquet_write_primitives_zstd", "pxf_parquet_read_primitives_zstd", "parquet_write_primitives_zstd", new String[]{"COMPRESSION_CODEC=zstd"}); } + @Test(groups = {"features", "gpdb", "security", "hcfs"}) + public void parquetWritePrimitivesLZ4_RAW() throws Exception { + runWritePrimitivesScenario("pxf_parquet_write_primitives_lz4_raw", "pxf_parquet_read_primitives_lz4_raw", "parquet_write_primitives_lz4_raw", new String[]{"COMPRESSION_CODEC=lz4_raw"}); + } + // Numeric precision not defined, test writing data precision in [1, 38]. All the data should be written correctly. 
@Test(groups = {"features", "gpdb", "security", "hcfs"}) public void parquetWriteUndefinedPrecisionNumeric() throws Exception { diff --git a/automation/src/test/resources/sut/LocalToIPAMultiNodeHadoopHA.xml b/automation/src/test/resources/sut/LocalToIPAMultiNodeHadoopHA.xml index 437ac5a36..284baf493 100644 --- a/automation/src/test/resources/sut/LocalToIPAMultiNodeHadoopHA.xml +++ b/automation/src/test/resources/sut/LocalToIPAMultiNodeHadoopHA.xml @@ -88,7 +88,7 @@ make TEST=HdfsHAFailoverTest - + diff --git a/automation/src/test/resources/sut/MultiHadoopIPAMultiNodesCluster.xml b/automation/src/test/resources/sut/MultiHadoopIPAMultiNodesCluster.xml index dc9ced06f..6b18b05c1 100644 --- a/automation/src/test/resources/sut/MultiHadoopIPAMultiNodesCluster.xml +++ b/automation/src/test/resources/sut/MultiHadoopIPAMultiNodesCluster.xml @@ -128,7 +128,7 @@ - + diff --git a/automation/src/test/resources/sut/MultiHadoopMultiNodesCluster.xml b/automation/src/test/resources/sut/MultiHadoopMultiNodesCluster.xml index 167d1507f..a6195c61f 100644 --- a/automation/src/test/resources/sut/MultiHadoopMultiNodesCluster.xml +++ b/automation/src/test/resources/sut/MultiHadoopMultiNodesCluster.xml @@ -113,7 +113,7 @@ - + diff --git a/automation/src/test/resources/sut/MultiNodesCluster.xml b/automation/src/test/resources/sut/MultiNodesCluster.xml index a0f01e564..5d3e0ff08 100644 --- a/automation/src/test/resources/sut/MultiNodesCluster.xml +++ b/automation/src/test/resources/sut/MultiNodesCluster.xml @@ -87,7 +87,7 @@ - + diff --git a/automation/src/test/resources/sut/default.xml b/automation/src/test/resources/sut/default.xml index ed24017a1..7c9c4b689 100644 --- a/automation/src/test/resources/sut/default.xml +++ b/automation/src/test/resources/sut/default.xml @@ -97,7 +97,7 @@ - + diff --git a/automation/src/test/resources/templates/gpdb/gpinitsystem_config b/automation/src/test/resources/templates/gpdb/gpinitsystem_config index bbd1b3a78..9940ce05f 100755 --- 
a/automation/src/test/resources/templates/gpdb/gpinitsystem_config +++ b/automation/src/test/resources/templates/gpdb/gpinitsystem_config @@ -25,14 +25,14 @@ PORT_BASE=40000 DATA_DIRECTORY=(/data/gpdb/p1 /data/gpdb/p2) #### OS-configured hostname or IP address of the master host. -MASTER_HOSTNAME=centos64-1 +COORDINATOR_HOSTNAME=centos64-1 -#### File system location where the master data directory +#### File system location where the coordinator data directory #### will be created. -MASTER_DIRECTORY=/data/gpdb/master +COORDINATOR_DIRECTORY=/data/gpdb/coordinator #### Port number for the master instance. -MASTER_PORT=5432 +COORDINATOR_PORT=5432 #### Shell utility used to connect to remote hosts. TRUSTED_SHELL=ssh diff --git a/ci/README.md b/ci/README.md deleted file mode 100644 index 2af1578a1..000000000 --- a/ci/README.md +++ /dev/null @@ -1,143 +0,0 @@ -# Concourse pipeline deployment -To facilitate pipeline maintenance, a Python utility 'deploy` -is used to generate the different pipelines for PXF main, -PXF 5x and release pipelines. It also allows the generation -of acceptance and custom pipelines for developers to use. - -The utility uses the [Jinja2](http://jinja.pocoo.org/) template -engine for Python. This allows the generation of portions of the -pipeline from common blocks of pipeline code. Logic (Python code) can -be embedded to further manipulate the generated pipeline. 
- -# Deploy the `pxf-build` (release) pipeline - -To deploy the build pipeline for PXF, make sure PXF main branch is currently checked-out and run this command: - -```shell script -make -C "${HOME}/workspace/pxf/concourse" build -``` - -# Deploy the `pxf-certification` (release) pipeline - -To deploy the certifcation pipeline (forward compatibility) for PXF, make sure PXF main branch is currently checked-out and run this command: - -```shell script -make -C "${HOME}/workspace/pxf/concourse" certification -``` - -# Deploy the singlecluster pipeline - -The singlecluster pipeline generates the singlecluster tarball for CDH, HDP2, -and HDP3. The generated tarballs are then published to an S3 and GCS bucket. -The produced tarballs can then be consumed in the pxf-build pipelines. - -```shell script -make -C "${HOME}/workspace/pxf/concourse" singlecluster -``` - -# Deploy the cloudbuild pipeline - -```shell script -make -C "${HOME}/workspace/pxf/concourse" cloudbuild -``` - -# Deploy the pull-request pipeline - -```shell script -make -C "${HOME}/workspace/pxf/concourse" pr -``` - -# Deploy the performance pipelines - -10G Performance pipeline: - -```shell script -make SCALE=10 -C "${HOME}/workspace/pxf/concourse" perf -``` - -You can deploy a development version of the perf pipeline by substituting the name -of your development branch into `pxf-git-branch=main`. Also, make sure to change -the name of your development pipeline (i.e. `-p dev:`). - -50G Performance pipeline: - -```shell script -make SCALE=50 -C "${HOME}/workspace/pxf/concourse" perf -``` - -500G Performance pipeline: - -```shell script -make SCALE=500 -C "${HOME}/workspace/pxf/concourse" perf -``` - -By default, these pipelines run perf on RHEL7. -If you would like to run pipelines using RHEL8, please include `REDHAT_MAJOR_VERSION=8` to the command. 
-Ex: `make SCALE=10 REDHAT_MAJOR_VERSION=8 -C "${HOME}/workspace/pxf/concourse" perf` - -# Deploy development PXF release pipelines - -The dev release pipeline performs most functions of the `pxf-build` release pipeline except for the tagging and bumping of the build version. - -To deploy dev release pipeline, use: - -```shell -make -C "${HOME}/workspace/pxf/concourse" dev-release -``` - -# Deploy development PXF pipelines - -The dev pipeline is an abbreviated version of the `pxf-build` pipeline. - -To deploy dev pipeline against gpdb 5X_STABLE and 6X_STABLE branches, use: - -```shell -make -C "${HOME}/workspace/pxf/concourse" dev -``` - -To deploy multi-node dev pipeline, you can specify the following options -* `MULTINODE_EL7=` for EL7 -* `MULTINODE_EL8=` for EL8 -* `MULTINODE_EL9=` for EL9 -* `MULTINODE_NO_IMPERSONATION=` for EL7, which will also run CLI tests - -```shell -MULTINODE_EL7=true make -C "${HOME}/workspace/pxf/concourse" dev -``` - -This command will automatically point the pipeline at your currently checked-out branch of PXF. - -# Deploy Longevity Testing PXF pipeline -The longevity testing pipeline is designed to work off a PXF tag that needs to be provided as a parameter when -creating the pipeline. The generated pipeline compiles PXF, creates a Greenplum CCP cluster and 2 secure dataproc clusters -and runs a multi-cluster security test every 15 minutes. CCP cluster is set with expiration time of more than 6 months, so -it needs to be cleaned manually and so do the dataproc clusters. - -```shell -YOUR_TAG= make -C "${HOME}/workspace/pxf/concourse" longevity -``` - -## Uploading a new Apache Maven 3 version - -The CI pipelines for PXF run automation tests using Apache Maven 3.x. Instead of downloading this directly from the Apache -mirrors or Apache archive, we store a copy in Google Cloud Storage to use when we create our images in Cloudbuild. -Typically, we will not be updating these values very often. 
However, if we need to upload a new version of Maven, you -can use a snippet like this one to download and then upload to GCS. - -```bash -./scripts/download-maven-from-apache-mirror.sh -gcloud storage cp ../downloads/apache-maven--bin.tar.gz gs://data-gpdb-ud-pxf-build-resources/apache-maven - -# Example for Apache Maven 3.9.2 -./scripts/download-spark-from-apache-mirror.sh 3.9.2 -gcloud storage cp ../downloads/apache-maven-3.9.2-bin.tar.gz gs://data-gpdb-ud-pxf-build-resources/apache-maven - -# Example for Apache Maven 3 Latest -$ ./scripts/download-spark-from-apache-mirror.sh latest -> Looking for latest maven-3 version... -> Latest maven version determined to be: 3.9.3 -> Would you like to proceed (y/n)? y - -gcloud storage cp ../downloads/apache-maven-3.9.3-bin.tar.gz gs://data-gpdb-ud-pxf-build-resources/apache-maven - -``` diff --git a/ci/docker/pxf-cbdb-dev/ubuntu/script/entrypoint_kerberos.sh b/ci/docker/pxf-cbdb-dev/ubuntu/script/entrypoint_kerberos.sh index f64fabeea..52a26f351 100755 --- a/ci/docker/pxf-cbdb-dev/ubuntu/script/entrypoint_kerberos.sh +++ b/ci/docker/pxf-cbdb-dev/ubuntu/script/entrypoint_kerberos.sh @@ -35,7 +35,7 @@ ADMIN_PASS=${ADMIN_PASS:-AdminPass@123} PXF_BASE=${PXF_BASE:-/home/gpadmin/pxf-base} GPHOME=${GPHOME:-/usr/local/cloudberry-db} # GPDB demo master path is required by pg_hba reloads; define a default up front. -MASTER_DATA_DIRECTORY=${MASTER_DATA_DIRECTORY:-/home/gpadmin/workspace/cloudberry/gpAux/gpdemo/datadirs/qddir/demoDataDir-1} +COORDINATOR_DATA_DIRECTORY=${COORDINATOR_DATA_DIRECTORY:-/home/gpadmin/workspace/cloudberry/gpAux/gpdemo/datadirs/qddir/demoDataDir-1} # Java locations vary by arch; prefer Java 8 for Hadoop runtime and Java 11 for builds if needed. JAVA_11_ARM=/usr/lib/jvm/java-11-openjdk-arm64 @@ -844,8 +844,8 @@ configure_pg_hba() { } | awk '!seen[$0]++' | sudo tee "${tmp_pg_hba}" >/dev/null sudo mv "${tmp_pg_hba}" "${PG_HBA}" # Reload cluster so new HBA rules take effect immediately for test users. 
- if [ -n "${MASTER_DATA_DIRECTORY}" ] && [ -x "${GPHOME}/bin/pg_ctl" ]; then - sudo -u gpadmin env MASTER_DATA_DIRECTORY=${MASTER_DATA_DIRECTORY} GPHOME=${GPHOME} "${GPHOME}/bin/pg_ctl" reload -D "${MASTER_DATA_DIRECTORY}" >/dev/null 2>&1 || true + if [ -n "${COORDINATOR_DATA_DIRECTORY}" ] && [ -x "${GPHOME}/bin/pg_ctl" ]; then + sudo -u gpadmin env COORDINATOR_DATA_DIRECTORY=${COORDINATOR_DATA_DIRECTORY} GPHOME=${GPHOME} "${GPHOME}/bin/pg_ctl" reload -D "${COORDINATOR_DATA_DIRECTORY}" >/dev/null 2>&1 || true fi } @@ -875,7 +875,7 @@ ensure_gpdb_databases() { sudo -u gpadmin env ${env_path} "${createdb_bin}" "${conn_flags[@]}" -E UTF8 pxfautomation_encoding >/dev/null 2>&1 || true fi - sudo -u gpadmin env MASTER_DATA_DIRECTORY="${mdd}" GPHOME="${gphome}" "${gphome}/bin/pg_ctl" reload -D "${mdd}" >/dev/null 2>&1 || true + sudo -u gpadmin env COORDINATOR_DATA_DIRECTORY="${mdd}" GPHOME="${gphome}" "${gphome}/bin/pg_ctl" reload -D "${mdd}" >/dev/null 2>&1 || true } verify_security_mode() { @@ -1074,7 +1074,7 @@ init_test_env() { export PGPORT=${PGPORT:-7000} export PGDATABASE=${PGDATABASE:-pxfautomation} export PGUSER=${PGUSER:-gpadmin} - export MASTER_DATA_DIRECTORY=${MASTER_DATA_DIRECTORY:-/home/gpadmin/workspace/cloudberry/gpAux/gpdemo/datadirs/qddir/demoDataDir-1} + export COORDINATOR_DATA_DIRECTORY=${COORDINATOR_DATA_DIRECTORY:-/home/gpadmin/workspace/cloudberry/gpAux/gpdemo/datadirs/qddir/demoDataDir-1} export GPHOME=${GPHOME:-/usr/local/cloudberry-db} export PATH=/usr/local/bin:${GPHOME}/bin:${PATH} export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/home/gpadmin/workspace/singlecluster/hadoop/etc/hadoop} @@ -1137,19 +1137,19 @@ EOS pgrep -f sshd >/dev/null 2>&1 || sudo service ssh start >/dev/null 2>&1 || true if ! 
pgrep -f "${GPHOME}/bin/postgres" >/dev/null 2>&1; then - sudo -u gpadmin env MASTER_DATA_DIRECTORY=${MASTER_DATA_DIRECTORY} GPHOME=${GPHOME} "${GPHOME}/bin/gpstart" -a >/dev/null 2>&1 || true + sudo -u gpadmin env COORDINATOR_DATA_DIRECTORY=${COORDINATOR_DATA_DIRECTORY} GPHOME=${GPHOME} "${GPHOME}/bin/gpstart" -a >/dev/null 2>&1 || true fi if [ -f "${PG_HBA}" ] && ! grep -q "mdw/32 trust" "${PG_HBA}"; then sed -i '1ihost all all mdw/32 trust' "${PG_HBA}" || echo "host all all mdw/32 trust" | sudo tee -a "${PG_HBA}" >/dev/null - sudo -u gpadmin env MASTER_DATA_DIRECTORY=${MASTER_DATA_DIRECTORY} GPHOME=${GPHOME} "${GPHOME}/bin/pg_ctl" reload -D "${MASTER_DATA_DIRECTORY}" >/dev/null 2>&1 || true + sudo -u gpadmin env COORDINATOR_DATA_DIRECTORY=${COORDINATOR_DATA_DIRECTORY} GPHOME=${GPHOME} "${GPHOME}/bin/pg_ctl" reload -D "${COORDINATOR_DATA_DIRECTORY}" >/dev/null 2>&1 || true fi if [ -f "${PG_HBA}" ] && ! grep -q "172.18.0.0/16" "${PG_HBA}"; then sed -i '1ihost all all 172.18.0.0/16 trust' "${PG_HBA}" || echo "host all all 172.18.0.0/16 trust" | sudo tee -a "${PG_HBA}" >/dev/null - sudo -u gpadmin env MASTER_DATA_DIRECTORY=${MASTER_DATA_DIRECTORY} GPHOME=${GPHOME} "${GPHOME}/bin/pg_ctl" reload -D "${MASTER_DATA_DIRECTORY}" >/dev/null 2>&1 || true + sudo -u gpadmin env COORDINATOR_DATA_DIRECTORY=${COORDINATOR_DATA_DIRECTORY} GPHOME=${GPHOME} "${GPHOME}/bin/pg_ctl" reload -D "${COORDINATOR_DATA_DIRECTORY}" >/dev/null 2>&1 || true fi sudo -u gpadmin env PGHOST=${PGHOST} PGPORT=${PGPORT} PGUSER=${PGUSER} "${GPHOME}/bin/createdb" -T template1 pxfautomation >/dev/null 2>&1 || true sudo -u gpadmin env PGHOST=${PGHOST} PGPORT=${PGPORT} PGUSER=${PGUSER} "${GPHOME}/bin/createdb" -T template0 --encoding=WIN1251 --lc-collate=C --lc-ctype=C pxfautomation_encoding >/dev/null 2>&1 || true - ensure_gpdb_databases "${PGHOST}" "${PGPORT}" "${GPHOME}" "${MASTER_DATA_DIRECTORY}" + ensure_gpdb_databases "${PGHOST}" "${PGPORT}" "${GPHOME}" "${COORDINATOR_DATA_DIRECTORY}" for stub in 
pxf-pre-gpupgrade pxf-post-gpupgrade; do if [ ! -x "/usr/local/bin/${stub}" ]; then sudo tee "/usr/local/bin/${stub}" >/dev/null <<'SH' diff --git a/ci/docker/pxf-cbdb-dev/ubuntu/script/pxf-env.sh b/ci/docker/pxf-cbdb-dev/ubuntu/script/pxf-env.sh index 545885164..2743d4a26 100755 --- a/ci/docker/pxf-cbdb-dev/ubuntu/script/pxf-env.sh +++ b/ci/docker/pxf-cbdb-dev/ubuntu/script/pxf-env.sh @@ -35,7 +35,7 @@ export COMMON_JAVA_OPTS=${COMMON_JAVA_OPTS:-} # -------------------------------------------------------------------- export PGHOST=${PGHOST:-localhost} export PGPORT=${PGPORT:-7000} -export MASTER_DATA_DIRECTORY=${MASTER_DATA_DIRECTORY:-/home/gpadmin/workspace/cloudberry/gpAux/gpdemo/datadirs/qddir/demoDataDir-1} +export COORDINATOR_DATA_DIRECTORY=${COORDINATOR_DATA_DIRECTORY:-/home/gpadmin/workspace/cloudberry/gpAux/gpdemo/datadirs/qddir/demoDataDir-1} # set cloudberry timezone utc export PGTZ=UTC diff --git a/ci/singlecluster/Dockerfile b/ci/singlecluster/Dockerfile index 8e8c4621a..abb60e3cc 100644 --- a/ci/singlecluster/Dockerfile +++ b/ci/singlecluster/Dockerfile @@ -12,14 +12,14 @@ RUN sudo apt-get update && \ ENV HADOOP_VERSION=3.1.2 ENV HIVE_VERSION=3.1.3 ENV ZOOKEEPER_VERSION=3.5.9 -ENV HBASE_VERSION=2.0.6 +ENV HBASE_VERSION=2.3.7 ENV TEZ_VERSION=0.9.2 # checksums from archive.apache.org ENV HADOOP_SHA512="0e0ee817c89b3c4eb761eca7f16640742a83b0e99b6fda26c1bee2baabedad93aab86e252bf5f1e2381c6d464bc4003d10c7cc0f61b2062f4c59732ca24d1bd9" ENV HIVE_SHA256="0c9b6a6359a7341b6029cc9347435ee7b379f93846f779d710b13f795b54bb16" ENV ZOOKEEPER_SHA512="0e5a64713abc6f36d961dd61a06f681868171a9d9228366e512a01324806d263e05508029c94d8e18307811867cdc39d848e736c252bf56c461273ef74c66a45" -ENV HBASE_SHA512="a0e10904ecf7f059b77bc0ce704254046a978126db720cc7e55dc53b87097715da64b8391fe3cc94348bc432871ad8f29891dc8df1ea052eb628da0fdca97c93" +ENV HBASE_SHA512="1032521025660daa70260cdc931f52a26c87596be444451fe1fa88b526ede55e9d6b4220e91ff6f7422bec11f30d64fa6745e95a9c36971fdb1a264a2c745693" 
ENV TEZ_SHA512="a2d94bd9fa778d42a8bac9d9da8e263e469ddfef93968b06434716554995f490231de5607541ac236e770aa0158b64250c38bc1cd57dbfa629fea705f2ffa2f5" # faster mirror: @@ -63,7 +63,7 @@ RUN mkdir -p $ZOOKEEPER_ROOT && \ RUN mkdir -p $HBASE_ROOT && \ curl -fSL "$HBASE_URL" -o hbase.tar.gz && \ echo "$HBASE_SHA512 hbase.tar.gz" | sha512sum -c && \ - tar xvf hbase.tar.gz -C $HBASE_ROOT --strip-components 1 --exclude="docs/*" && \ + tar xvf hbase.tar.gz -C $HBASE_ROOT --strip-components 1 --exclude="docs/*" --exclude="lib/*-tests.jar" --exclude="lib/shaded-clients" && \ rm hbase.tar.gz RUN mkdir -p $TEZ_ROOT && \ diff --git a/ci/singlecluster/README.HDP3.md b/ci/singlecluster/README.HDP3.md index 16506de11..3a906e50a 100644 --- a/ci/singlecluster/README.HDP3.md +++ b/ci/singlecluster/README.HDP3.md @@ -7,7 +7,7 @@ It contains the following versions: - Hadoop 3.3.6 - Hive 3.1.3 - Zookeeper 3.5.9 -- HBase 2.0.6 +- HBase 2.3.7 - Tez 0.9.2 This version of Single cluster requires users to make some manual changes to the configuration files once the tarball has been unpacked (see Initialization steps below). diff --git a/docs/book/config.yml b/docs/book/config.yml index 361733b4e..ec750a963 100644 --- a/docs/book/config.yml +++ b/docs/book/config.yml @@ -1,4 +1,4 @@ -book_repo: greenplum-db/docs/book +book_repo: apache/cloudberry-pxf/docs/book public_host: localhost:9292 @@ -9,13 +9,13 @@ sections: subnav_template: pxf-subnav template_variables: - book_title: Greenplum Database PXF Documentation - book_title_short: Greenplum Database PXF Docs - domain_name: greenplum.org - product_link: - product_url: https://greenplum.org - support_call_to_action: Need Support? - support_link: Wiki - support_url: https://greenplum.org + book_title: Apache Cloudberry PXF Documentation + book_title_short: Apache Cloudberry PXF Docs + domain_name: cloudberry.apache.org + product_link: + product_url: https://cloudberry.apache.org + support_call_to_action: Need Support? 
+ support_link: GitHub + support_url: https://cloudberry.apache.org broken_link_exclusions: iefix|arrowhead diff --git a/docs/content/access_hdfs.html.md.erb b/docs/content/access_hdfs.html.md.erb index 30babaa21..e5a970ec5 100644 --- a/docs/content/access_hdfs.html.md.erb +++ b/docs/content/access_hdfs.html.md.erb @@ -25,34 +25,34 @@ PXF is compatible with Cloudera, Hortonworks Data Platform, and generic Apache H ## Architecture -HDFS is the primary distributed storage mechanism used by Apache Hadoop. When a user or application performs a query on a PXF external table that references an HDFS file, the Greenplum Database coordinator host dispatches the query to all segment instances. Each segment instance contacts the PXF Service running on its host. When it receives the request from a segment instance, the PXF Service: +HDFS is the primary distributed storage mechanism used by Apache Hadoop. When a user or application performs a query on a PXF external table that references an HDFS file, the Apache Cloudberry coordinator host dispatches the query to all segment instances. Each segment instance contacts the PXF Service running on its host. When it receives the request from a segment instance, the PXF Service: 1. Allocates a worker thread to serve the request from the segment instance. 2. Invokes the HDFS Java API to request metadata information for the HDFS file from the HDFS NameNode. Figure: PXF-to-Hadoop Architecture -![Greenplum Platform Extenstion Framework to Hadoop Architecture](graphics/pxfarch.png "Greenplum Platform Extension Framework-to-Hadoop Architecture") +![Apache Cloudberry Platform Extension Framework to Hadoop Architecture](graphics/pxfarch.png "Apache Cloudberry Platform Extension Framework-to-Hadoop Architecture") -A PXF worker thread works on behalf of a segment instance. A worker thread uses its Greenplum Database `gp_segment_id` and the file block information described in the metadata to assign itself a specific portion of the query data.
This data may reside on one or more HDFS DataNodes. +A PXF worker thread works on behalf of a segment instance. A worker thread uses its Apache Cloudberry `gp_segment_id` and the file block information described in the metadata to assign itself a specific portion of the query data. This data may reside on one or more HDFS DataNodes. -The PXF worker thread invokes the HDFS Java API to read the data and delivers it to the segment instance. The segment instance delivers its portion of the data to the Greenplum Database coordinator host. This communication occurs across segment hosts and segment instances in parallel. +The PXF worker thread invokes the HDFS Java API to read the data and delivers it to the segment instance. The segment instance delivers its portion of the data to the Apache Cloudberry coordinator host. This communication occurs across segment hosts and segment instances in parallel. ## Prerequisites Before working with Hadoop data using PXF, ensure that: -- You have configured PXF, and PXF is running on each Greenplum Database host. See [Configuring PXF](instcfg_pxf.html) for additional information. +- You have configured PXF, and PXF is running on each Apache Cloudberry host. See [Configuring PXF](instcfg_pxf.html) for additional information. - You have configured the PXF Hadoop Connectors that you plan to use. Refer to [Configuring PXF Hadoop Connectors](client_instcfg.html) for instructions. If you plan to access JSON-formatted data stored in a Cloudera Hadoop cluster, PXF requires a Cloudera version 5.8 or later Hadoop distribution. -- If user impersonation is enabled (the default), ensure that you have granted read (and write as appropriate) permission to the HDFS files and directories that will be accessed as external tables in Greenplum Database to each Greenplum Database user/role name that will access the HDFS files and directories. If user impersonation is not enabled, you must grant this permission to the `gpadmin` user. 
-- Time is synchronized between the Greenplum Database hosts and the external Hadoop systems. +- If user impersonation is enabled (the default), ensure that you have granted read (and write as appropriate) permission to the HDFS files and directories that will be accessed as external tables in Apache Cloudberry to each Apache Cloudberry user/role name that will access the HDFS files and directories. If user impersonation is not enabled, you must grant this permission to the `gpadmin` user. +- Time is synchronized between the Apache Cloudberry hosts and the external Hadoop systems. ## HDFS Shell Command Primer Examples in the PXF Hadoop topics access files on HDFS. You can choose to access files that already exist in your HDFS cluster. Or, you can follow the steps in the examples to create new files. -A Hadoop installation includes command-line tools that interact directly with your HDFS file system. These tools support typical file system operations that include copying and listing files, changing file permissions, and so forth. You run these tools on a system with a Hadoop client installation. By default, Greenplum Database hosts do not +A Hadoop installation includes command-line tools that interact directly with your HDFS file system. These tools support typical file system operations that include copying and listing files, changing file permissions, and so forth. You run these tools on a system with a Hadoop client installation. By default, Apache Cloudberry hosts do not include a Hadoop client installation. The HDFS file system command syntax is `hdfs dfs []`. Invoked with no options, `hdfs dfs` lists the file system options supported by the tool. 
@@ -103,26 +103,26 @@ The PXF Hadoop connectors provide built-in profiles to support the following dat The PXF Hadoop connectors expose the following profiles to read, and in many cases write, these supported data formats: -| Data Source | Data Format | Profile Name(s) | Deprecated Profile Name | Supported Operations | +| Data Source | Data Format | Profile Name(s) | Foreign Data Wrapper format | Supported Operations | |-------------|------|---------|-----|-----| -| HDFS | delimited single line [text](hdfs_text.html#profile_text) | hdfs:text | n/a | Read, Write | -| HDFS | delimited single line comma-separated values of [text](hdfs_text.html#profile_text) | hdfs:csv | n/a | Read, Write | -| HDFS | multi-byte or multi-character delimited single line [csv](hdfs_text.html#multibyte_delim) | hdfs:csv | n/a | Read | -| HDFS | fixed width single line [text](hdfs_fixedwidth.html) | hdfs:fixedwidth | n/a | Read, Write | -| HDFS | delimited [text with quoted linefeeds](hdfs_text.html#profile_textmulti) | hdfs:text:multi | n/a | Read | -| HDFS | [Avro](hdfs_avro.html) | hdfs:avro | n/a | Read, Write | -| HDFS | [JSON](hdfs_json.html) | hdfs:json | n/a | Read, Write | -| HDFS | [ORC](hdfs_orc.html) | hdfs:orc | n/a | Read, Write | -| HDFS | [Parquet](hdfs_parquet.html) | hdfs:parquet | n/a | Read, Write | -| HDFS | AvroSequenceFile | hdfs:AvroSequenceFile | n/a | Read, Write | -| HDFS | [SequenceFile](hdfs_seqfile.html) | hdfs:SequenceFile | n/a | Read, Write | -| [Hive](hive_pxf.html) | stored as TextFile | hive, [hive:text] (hive_pxf.html#hive_text) | Hive, HiveText | Read | -| [Hive](hive_pxf.html) | stored as SequenceFile | hive | Hive | Read | -| [Hive](hive_pxf.html) | stored as RCFile | hive, [hive:rc](hive_pxf.html#hive_hiverc) | Hive, HiveRC | Read | -| [Hive](hive_pxf.html) | stored as ORC | hive, [hive:orc](hive_pxf.html#hive_orc) | Hive, HiveORC, HiveVectorizedORC | Read | -| [Hive](hive_pxf.html) | stored as Parquet | hive | Hive | Read | -| [Hive](hive_pxf.html) 
| stored as Avro | hive | Hive | Read | -| [HBase](hbase_pxf.html) | Any | hbase | HBase | Read | +| HDFS | delimited single line [text](hdfs_text.html#profile_text) | hdfs:text | text | Read, Write | +| HDFS | delimited single line comma-separated values of [text](hdfs_text.html#profile_text) | hdfs:csv | csv | Read, Write | +| HDFS | multi-byte or multi-character delimited single line [csv](hdfs_text.html#multibyte_delim) | hdfs:csv | csv | Read | +| HDFS | fixed width single line [text](hdfs_fixedwidth.html) | hdfs:fixedwidth | | Read, Write | +| HDFS | delimited [text with quoted linefeeds](hdfs_text.html#profile_textmulti) | hdfs:text:multi | text:multi | Read | +| HDFS | [Avro](hdfs_avro.html) | hdfs:avro | avro | Read, Write | +| HDFS | [JSON](hdfs_json.html) | hdfs:json | json | Read, Write | +| HDFS | [ORC](hdfs_orc.html) | hdfs:orc | orc | Read, Write | +| HDFS | [Parquet](hdfs_parquet.html) | hdfs:parquet | parquet | Read, Write | +| HDFS | AvroSequenceFile | hdfs:AvroSequenceFile | AvroSequenceFile | Read, Write | +| HDFS | [SequenceFile](hdfs_seqfile.html) | hdfs:SequenceFile | SequenceFile | Read, Write | +| [Hive](hive_pxf.html) | stored as TextFile | hive, [hive:text](hive_pxf.html#hive_text) | | Read | +| [Hive](hive_pxf.html) | stored as SequenceFile | hive | | Read | +| [Hive](hive_pxf.html) | stored as RCFile | hive, [hive:rc](hive_pxf.html#hive_hiverc) | | Read | +| [Hive](hive_pxf.html) | stored as ORC | hive, [hive:orc](hive_pxf.html#hive_orc) | orc | Read | +| [Hive](hive_pxf.html) | stored as Parquet | hive | | Read | +| [Hive](hive_pxf.html) | stored as Avro | hive | | Read | +| [HBase](hbase_pxf.html) | Any | hbase | - | Read | ### Choosing the Profile @@ -143,12 +143,29 @@ When accessing ORC-format data: Choose the `hdfs:parquet` profile when the file is Parquet, you know the location of the file in the HDFS file system, and you want to take advantage of extended filter pushdown support for additional data types and operators. 
-### Specifying the Profile +### Specifying the Profile for External Tables -You must provide the profile name when you specify the `pxf` protocol in a `CREATE EXTERNAL TABLE` command to create a Greenplum Database external table that references a Hadoop file or directory, HBase table, or Hive table. For example, the following command creates an external table that uses the default server and specifies the profile named `hdfs:text` to access the HDFS file `/data/pxf_examples/pxf_hdfs_simple.txt`: +You must provide the profile name when you specify the `pxf` protocol in a `CREATE EXTERNAL TABLE` command to create an Apache Cloudberry external table that references a Hadoop file or directory, HBase table, or Hive table. For example, the following command creates an external table that uses the default server and specifies the profile named `hdfs:text` to access the HDFS file `/data/pxf_examples/pxf_hdfs_simple.txt`: ``` sql CREATE EXTERNAL TABLE pxf_hdfs_text(location text, month text, num_orders int, total_sales float8) LOCATION ('pxf://data/pxf_examples/pxf_hdfs_simple.txt?PROFILE=hdfs:text') FORMAT 'TEXT' (delimiter=E','); ``` + +### Specifying the Profile for Foreign Tables + +When you use the `hdfs_pxf_fdw`, `hive_pxf_fdw`, or `hbase_pxf_fdw` foreign data wrapper in a `CREATE FOREIGN TABLE` command, you must specify a server name you configured in the Prerequisites section above. The foreign table can reference a Hadoop file or directory, an HBase table, or a Hive table.
For example, the following commands create a foreign server named `hadoop_server` with the `hdfs_pxf_fdw` foreign data wrapper, then create a foreign table that uses the `text` format to access the HDFS file `data/pxf_examples/pxf_hdfs_simple.txt`: + +``` sql +CREATE SERVER hadoop_server FOREIGN DATA WRAPPER hdfs_pxf_fdw; +CREATE USER MAPPING FOR CURRENT_USER SERVER hadoop_server; + +CREATE FOREIGN TABLE pxf_parquet_s3 (location text, month text, num_orders int, total_sales float8) +SERVER hadoop_server +OPTIONS ( + resource 'data/pxf_examples/pxf_hdfs_simple.txt', + format 'text', + delimiter ',' +); +``` diff --git a/docs/content/hdfs_parquet.html.md.erb b/docs/content/hdfs_parquet.html.md.erb index 9ad05b785..86bed745d 100644 --- a/docs/content/hdfs_parquet.html.md.erb +++ b/docs/content/hdfs_parquet.html.md.erb @@ -23,7 +23,7 @@ under the License. Use the PXF HDFS connector to read and write Parquet-format data. This section describes how to read and write HDFS files that are stored in Parquet format, including how to create, query, and insert into external tables that reference files in the HDFS data store. -PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, and `zstd`. +PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, `lz4_raw`, and `zstd`. PXF currently supports reading and writing primitive Parquet data types only. @@ -35,7 +35,7 @@ Ensure that you have met the PXF Hadoop [Prerequisites](access_hdfs.html#hadoop_ ## Data Type Mapping -To read and write Parquet primitive data types in Greenplum Database, map Parquet data values to Greenplum Database columns of the same type. +To read and write Parquet primitive data types in Apache Cloudberry, map Parquet data values to Apache Cloudberry columns of the same type. Parquet supports a small set of primitive data types, and uses metadata annotations to extend the data types that it supports.
These annotations specify how to interpret the primitive type. For example, Parquet stores both `INTEGER` and `DATE` types as the `INT32` primitive type. An annotation identifies the original type as a `DATE`. @@ -45,7 +45,7 @@ Parquet supports a small set of primitive data types, and uses metadata annotati PXF uses the following data type mapping when reading Parquet data: -| Parquet Physical Type | Parquet Logical Type | PXF/Greenplum Data Type | +| Parquet Physical Type | Parquet Logical Type | PXF/Cloudberry Data Type | |-------------------|---------------|--------------------------| | boolean | -- | Boolean | | binary \(byte\_array\) | -- | Bytea | @@ -67,7 +67,7 @@ PXF uses the following data type mapping when reading Parquet data: PXF can read a Parquet `LIST` nested type when it represents a one-dimensional array of certain Parquet types. The supported mappings follow: -| Parquet Data Type | PXF/Greenplum Data Type | +| Parquet Data Type | PXF/Cloudberry Data Type | |-------------------|-------------------------| | list of \ | Boolean[] | | list of \ | Bytea[] | @@ -90,7 +90,7 @@ PXF can read a Parquet `LIST` nested type when it represents a one-dimensional a PXF uses the following data type mapping when writing Parquet data: -| PXF/Greenplum Data Type | Parquet Physical Type | Parquet Logical Type | +| PXF/Cloudberry Data Type | Parquet Physical Type | Parquet Logical Type | |-------------------|---------------|--------------------------| | Bigint | int64 | -- | | Boolean | boolean | -- | @@ -114,7 +114,7 @@ PXF uses the following data type mapping when writing Parquet data: PXF can write a one-dimensional `LIST` of certain Parquet data types. 
The supported mappings follow: -| PXF/Greenplum Data Type | Parquet Data Type | +| PXF/Cloudberry Data Type | Parquet Data Type | |-------------------|--------------------------| | Bigint[] | list of \ | | Boolean[] | list of \ | @@ -149,7 +149,7 @@ When you provide the Parquet schema file to PXF, you must specify the absolute p The PXF HDFS connector `hdfs:parquet` profile supports reading and writing HDFS data in Parquet-format. When you insert records into a writable external table, the block(s) of data that you insert are written to one or more files in the directory that you specified. -Use the following syntax to create a Greenplum Database external table that references an HDFS directory: +Use the following syntax to create an Apache Cloudberry external table that references an HDFS directory: ``` sql CREATE [WRITABLE] EXTERNAL TABLE @@ -160,7 +160,7 @@ FORMAT 'CUSTOM' (FORMATTER='pxfwritable_import'|'pxfwritable_export') [DISTRIBUTED BY ( [, ... ] ) | DISTRIBUTED RANDOMLY]; ``` -The specific keywords and values used in the Greenplum Database [CREATE EXTERNAL TABLE](https://docs.vmware.com/en/VMware-Greenplum/6/greenplum-database/ref_guide-sql_commands-CREATE_EXTERNAL_TABLE.html) command are described in the table below. +The specific keywords and values used in the Apache Cloudberry [CREATE EXTERNAL TABLE](https://cloudberry.apache.org/docs/sql-stmts/create-external-table/) command are described in the table below. | Keyword | Value | |-------|-------------------------------------| @@ -169,10 +169,36 @@ The specific keywords and values used in the Greenplum Database [CREATE EXTERNAL | SERVER=\ | The named server configuration that PXF uses to access the data. PXF uses the `default` server if not specified. | | \ | \s are described below.| | FORMAT 'CUSTOM' | Use `FORMAT` '`CUSTOM`' with `(FORMATTER='pxfwritable_export')` (write) or `(FORMATTER='pxfwritable_import')` (read). 
| -| DISTRIBUTED BY | If you want to load data from an existing Greenplum Database table into the writable external table, consider specifying the same distribution policy or `` on both tables. Doing so will avoid extra motion of data between segments on the load operation. | +| DISTRIBUTED BY | If you want to load data from an existing Apache Cloudberry table into the writable external table, consider specifying the same distribution policy or `` on both tables. Doing so will avoid extra motion of data between segments on the load operation. | + + +## Creating the Foreign Table + +The PXF HDFS `hdfs_pxf_fdw` foreign data wrapper supports reading and writing Parquet-formatted HDFS files. When you insert records into a foreign table, the block(s) of data that you insert are written to one file per segment in the directory that you specified in the `resource` clause. + +Use the following syntax to create an Apache Cloudberry foreign table that references an HDFS file or directory: + +``` sql +CREATE SERVER FOREIGN DATA WRAPPER hdfs_pxf_fdw; +CREATE USER MAPPING FOR SERVER ; + +CREATE FOREIGN TABLE [ IF NOT EXISTS ] + ( [, ...] | LIKE ) + SERVER + OPTIONS ( resource '', format 'parquet' [, ''[...]]); +``` + +The specific keywords and values used in the Apache Cloudberry [CREATE FOREIGN TABLE](https://cloudberry.apache.org/docs/sql-stmts/create-foreign-table) command are described below. + +| Keyword | Value | +|-------|-------------------------------------| +| \ | The named server configuration that PXF uses to access the data. You can override credentials in `CREATE SERVER` statement as described in [Overriding the S3 Server Configuration for Foreign Tables](access_s3.html#s3_override_fdw) | +| \ | The path to the directory in the HDFS data store. When the `` configuration includes a [`pxf.fs.basePath`](cfg_server.html#pxf-fs-basepath) property setting, PXF considers \ to be relative to the base path specified. Otherwise, PXF considers it to be an absolute path. 
\ must not specify a relative path nor include the dollar sign (`$`) character. | +| format | The file format; specify `'parquet'` for Parquet-formatted data. | +| \ | \s are described below. | -The PXF `hdfs:parquet` profile supports the following read option. You specify this option in the `CREATE EXTERNAL TABLE` `LOCATION` clause: +The PXF `hdfs:parquet` profile supports the following read option: | Read Option | Value Description | |-------|-------------------------------------| @@ -182,13 +208,13 @@ The PXF `hdfs:parquet` profile supports encoding- and compression-related write | Write Option | Value Description | |-------|-------------------------------------| -| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `zstd`, and `uncompressed` . If this option is not provided, PXF compresses the data using `snappy` compression. | +| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `lz4_raw`, `zstd`, and `uncompressed` . If this option is not provided, PXF compresses the data using `snappy` compression. | | ROWGROUP_SIZE | A Parquet file consists of one or more row groups, a logical partitioning of the data into rows. `ROWGROUP_SIZE` identifies the size (in bytes) of the row group. The default row group size is `8 * 1024 * 1024` bytes. | | PAGE_SIZE | A row group consists of column chunks that are divided up into pages. `PAGE_SIZE` is the size (in bytes) of such a page. The default page size is `1 * 1024 * 1024` bytes. | | ENABLE\_DICTIONARY | A boolean value that specifies whether or not to enable dictionary encoding. The default value is `true`; dictionary encoding is enabled when PXF writes Parquet files. | | DICTIONARY\_PAGE\_SIZE | When dictionary encoding is enabled, there is a single dictionary page per column, per row group. `DICTIONARY_PAGE_SIZE` is similar to `PAGE_SIZE`, but for the dictionary. 
The default dictionary page size is `1 * 1024 * 1024` bytes. | | PARQUET_VERSION | The Parquet version; PXF supports the values `v1` and `v2` for this option. The default Parquet version is `v1`. | -| SCHEMA | The absolute path to the Parquet schema file on the Greenplum host or on HDFS. | +| SCHEMA | The absolute path to the Parquet schema file on the Cloudberry PXF host or on HDFS. | **Note**: You must explicitly specify `uncompressed` if you do not want PXF to compress the data. @@ -208,12 +234,29 @@ This example utilizes the data schema introduced in [Example: Reading Text Data In this example, you create a Parquet-format writable external table that uses the default PXF server to reference Parquet-format data in HDFS, insert some data into the table, and then create a readable external table to read the data. -1. Use the `hdfs:parquet` profile to create a writable external table. For example: +1. Apache Cloudberry does not support both reading from and writing to a single external table. Create two tables, one for reading and one for writing, that reference the same HDFS directory: ``` sql postgres=# CREATE WRITABLE EXTERNAL TABLE pxf_tbl_parquet (location text, month text, number_of_orders int, item_quantity_per_order int[], total_sales double precision) LOCATION ('pxf://data/pxf_examples/pxf_parquet?PROFILE=hdfs:parquet') FORMAT 'CUSTOM' (FORMATTER='pxfwritable_export'); + + postgres=# CREATE EXTERNAL TABLE read_pxf_parquet(location text, month text, number_of_orders int, item_quantity_per_order int[], total_sales double precision) + LOCATION ('pxf://data/pxf_examples/pxf_parquet?PROFILE=hdfs:parquet') + FORMAT 'CUSTOM' (FORMATTER='pxfwritable_import'); + ``` + + OR create a single foreign table for both read and write operations: + + ``` + testdb=# CREATE SERVER example_parquet FOREIGN DATA WRAPPER hdfs_pxf_fdw; + testdb=# CREATE USER MAPPING FOR CURRENT_USER SERVER example_parquet; + testdb=# CREATE FOREIGN TABLE pxf_tbl_parquet(location text, month text, number_of_orders int, 
item_quantity_per_order int[], total_sales double precision) + SERVER example_parquet + OPTIONS ( + resource 'data/pxf_examples/pxf_parquet', + format 'parquet' + ); ``` 2. Write a few records to the `pxf_parquet` HDFS directory by inserting directly into the `pxf_tbl_parquet` table. For example: @@ -223,20 +266,24 @@ In this example, you create a Parquet-format writable external table that uses t postgres=# INSERT INTO pxf_tbl_parquet VALUES ( 'Cleveland', 'Oct', 2, '{3333,7777}', 96645.37 ); ``` -3. Recall that Greenplum Database does not support directly querying a writable external table. To read the data in `pxf_parquet`, create a readable external Greenplum Database referencing this HDFS directory: +3. Query the readable external table `read_pxf_parquet`: ``` sql - postgres=# CREATE EXTERNAL TABLE read_pxf_parquet(location text, month text, number_of_orders int, item_quantity_per_order int[], total_sales double precision) - LOCATION ('pxf://data/pxf_examples/pxf_parquet?PROFILE=hdfs:parquet') - FORMAT 'CUSTOM' (FORMATTER='pxfwritable_import'); + postgres=# SELECT * FROM read_pxf_parquet ORDER BY total_sales; + ``` + ``` pre + location | month | number_of_orders | item_quantity_per_order | total_sales + -----------+-------+------------------+-------------------------+------------- + Frankfurt | Mar | 777 | {1,11,111} | 3956.98 + Cleveland | Oct | 3812 | {3333,7777} | 96645.4 + (2 rows) ``` -4. 
Query the readable external table `read_pxf_parquet`: + OR query the same foreign table `pxf_tbl_parquet`: ``` sql - postgres=# SELECT * FROM read_pxf_parquet ORDER BY total_sales; + postgres=# SELECT * FROM pxf_tbl_parquet ORDER BY total_sales; ``` - ``` pre location | month | number_of_orders | item_quantity_per_order | total_sales -----------+-------+------------------+-------------------------+------------- diff --git a/docs/content/hive_pxf.html.md.erb b/docs/content/hive_pxf.html.md.erb index 3884b12c0..4b470c744 100644 --- a/docs/content/hive_pxf.html.md.erb +++ b/docs/content/hive_pxf.html.md.erb @@ -335,7 +335,7 @@ Use the `hive:rc` profile to query RCFile-formatted data in a Hive table. ## Accessing ORC-Format Hive Tables -The Optimized Row Columnar (ORC) file format is a columnar file format that provides a highly efficient way to both store and access HDFS data. ORC format offers improvements over text and RCFile formats in terms of both compression and performance. PXF supports ORC version 1.2.1. +The Optimized Row Columnar (ORC) file format is a columnar file format that provides a highly efficient way to both store and access HDFS data. ORC format offers improvements over text and RCFile formats in terms of both compression and performance. ORC is type-aware and specifically designed for Hadoop workloads. ORC files store both the type of and encoding information for the data in the file. All columns within a single group of row data (also known as stripe) are stored together on disk in ORC format files. The columnar nature of the ORC format type enables read projection, helping avoid accessing unnecessary columns during a query. diff --git a/docs/content/index.html.md.erb b/docs/content/index.html.md.erb index 480ee7e91..aa7c1cc39 100644 --- a/docs/content/index.html.md.erb +++ b/docs/content/index.html.md.erb @@ -21,7 +21,7 @@ specific language governing permissions and limitations under the License. 
--> -The Greenplum Platform Extension Framework (PXF) provides parallel, high throughput data access and federated queries across heterogeneous data sources via built-in connectors that map a Greenplum Database external table definition to an external data source. PXF has its roots in the Apache HAWQ project. +The Apache Cloudberry Platform Extension Framework (PXF) provides parallel, high throughput data access and federated queries across heterogeneous data sources via built-in connectors that map a Greenplum Database external table definition to an external data source. PXF has its roots in the Apache HAWQ project. - [Overview of PXF](overview_pxf.html) - [Transitioning to Apache Cloudberry](transition_to_cloudberry.html) diff --git a/docs/content/instcfg_pxf.html.md.erb b/docs/content/instcfg_pxf.html.md.erb index 4b7a0a0d8..af9a4811c 100644 --- a/docs/content/instcfg_pxf.html.md.erb +++ b/docs/content/instcfg_pxf.html.md.erb @@ -1,7 +1,7 @@ --- title: Configuring PXF --- -Your Greenplum Database deployment consists of a coordinator host, a standby coordinator host, and multiple segment hosts. After you configure the Greenplum Platform Extension Framework (PXF), you start a single PXF JVM process (PXF Service) on each Greenplum Database host. +Your Greenplum Database deployment consists of a coordinator host, a standby coordinator host, and multiple segment hosts. After you configure the Apache Cloudberry Platform Extension Framework (PXF), you start a single PXF JVM process (PXF Service) on each Greenplum Database host. PXF provides connectors to Hadoop, Hive, HBase, object stores, network file systems, and external SQL data stores. You must configure PXF to support the connectors that you plan to use. 
diff --git a/docs/content/intro_pxf.html.md.erb b/docs/content/intro_pxf.html.md.erb index 59c2ec7cf..6f0c44c69 100644 --- a/docs/content/intro_pxf.html.md.erb +++ b/docs/content/intro_pxf.html.md.erb @@ -2,7 +2,7 @@ title: Introduction to PXF --- -The Greenplum Platform Extension Framework (PXF) provides *connectors* that enable you to access data stored in sources external to your Greenplum Database deployment. These connectors map an external data source to a Greenplum Database *external table* definition. When you create the Greenplum Database external table, you identify the external data store and the format of the data via a *server* name and a *profile* name that you provide in the command. +The Apache Cloudberry Platform Extension Framework (PXF) provides *connectors* that enable you to access data stored in sources external to your Greenplum Database deployment. These connectors map an external data source to a Greenplum Database *external table* definition. When you create the Greenplum Database external table, you identify the external data store and the format of the data via a *server* name and a *profile* name that you provide in the command. You can query the external table via Greenplum Database, leaving the referenced data in place. Or, you can use the external table to load the data into Greenplum Database for higher performance. diff --git a/docs/content/objstore_parquet.html.md.erb b/docs/content/objstore_parquet.html.md.erb index e0c1f1cb3..50cc26a26 100644 --- a/docs/content/objstore_parquet.html.md.erb +++ b/docs/content/objstore_parquet.html.md.erb @@ -32,7 +32,7 @@ Ensure that you have met the PXF Object Store [Prerequisites](access_objstore.ht ## Data Type Mapping -Refer to [Data Type Mapping](hdfs_parquet.html#datatype_map) in the PXF HDFS Parquet documentation for a description of the mapping between Greenplum Database and Parquet data types. 
+Refer to [Data Type Mapping](hdfs_parquet.html#datatype_map) in the PXF HDFS Parquet documentation for a description of the mapping between Apache Cloudberry and Parquet data types. ## Creating the External Table @@ -47,7 +47,7 @@ The PXF `:parquet` profiles support reading and writing data in Parque | S3 | s3 | -Use the following syntax to create a Greenplum Database external table that references an HDFS directory. When you insert records into a writable external table, the block(s) of data that you insert are written to one or more files in the directory that you specified. +Use the following syntax to create an Apache Cloudberry external table that references an HDFS directory. When you insert records into a writable external table, the block(s) of data that you insert are written to one or more files in the directory that you specified. ``` sql CREATE [WRITABLE] EXTERNAL TABLE @@ -58,7 +58,7 @@ FORMAT 'CUSTOM' (FORMATTER='pxfwritable_import'|'pxfwritable_export') [DISTRIBUTED BY ( [, ... ] ) | DISTRIBUTED RANDOMLY]; ``` -The specific keywords and values used in the Greenplum Database [CREATE EXTERNAL TABLE](https://docs.vmware.com/en/VMware-Greenplum/6/greenplum-database/ref_guide-sql_commands-CREATE_EXTERNAL_TABLE.html) command are described in the table below. +The specific keywords and values used in the Apache Cloudberry [CREATE EXTERNAL TABLE](https://cloudberry.apache.org/docs/sql-stmts/create-external-table/) command are described in the table below. | Keyword | Value | |-------|-------------------------------------| @@ -67,30 +67,70 @@ The specific keywords and values used in the Greenplum Database [CREATE EXTERNAL | SERVER=\ | The named server configuration that PXF uses to access the data. | | \=\ | Parquet-specific custom options are described in the [PXF HDFS Parquet documentation](hdfs_parquet.html#customopts). 
| | FORMAT 'CUSTOM' | Use `FORMAT` '`CUSTOM`' with `(FORMATTER='pxfwritable_export')` (write) or `(FORMATTER='pxfwritable_import')` (read). | -| DISTRIBUTED BY | If you want to load data from an existing Greenplum Database table into the writable external table, consider specifying the same distribution policy or `` on both tables. Doing so will avoid extra motion of data between segments on the load operation. | +| DISTRIBUTED BY | If you want to load data from an existing Apache Cloudberry table into the writable external table, consider specifying the same distribution policy or `` on both tables. Doing so will avoid extra motion of data between segments on the load operation. | If you are accessing an S3 object store: - You can provide S3 credentials via custom options in the `CREATE EXTERNAL TABLE` command as described in [Overriding the S3 Server Configuration for External Tables DDL](access_s3.html#s3_override_ext). - If you are reading Parquet data from S3, you can direct PXF to use the S3 Select Amazon service to retrieve the data. Refer to [Using the Amazon S3 Select Service](access_s3.html#s3_select) for more information about the PXF custom option used for this purpose. +## Creating the Foreign Table + +Use one of the following foreign data wrappers with `format 'parquet'`. + +| Object Store | Foreign Data Wrapper | +|-------|-------------------------------------| +| Azure Blob Storage | wasbs_pxf_fdw | +| Azure Data Lake Storage Gen2 | abfss_pxf_fdw | +| Google Cloud Storage | gs_pxf_fdw | +| MinIO | s3_pxf_fdw | +| S3 | s3_pxf_fdw | + +The following syntax creates a Apache Cloudberry foreign table that references an Parquet-format file: + +``` sql +CREATE SERVER FOREIGN DATA WRAPPER _pxf_fdw; +CREATE USER MAPPING FOR SERVER ; + +CREATE FOREIGN TABLE [ IF NOT EXISTS ] + ( [, ...] | LIKE ) + SERVER + OPTIONS ( resource '', format 'parquet' [, '' [, ...] 
]); +``` + +| Keyword | Value | +|-------|-------------------------------------| +| \ | The named server configuration that PXF uses to access the data. You can override credentials in `CREATE SERVER` statement as described in [Overriding the S3 Server Configuration for Foreign Tables](access_s3.html#s3_override_fdw) | +| resource \ | The path to the directory or file in the object store. When the `` configuration includes a [`pxf.fs.basePath`](cfg_server.html#pxf-fs-basepath) property setting, PXF considers \ to be relative to the base path specified. Otherwise, PXF considers it to be an absolute path. \ must not specify a relative path nor include the dollar sign (`$`) character. | +| format 'parquet' | The file format; specify `'parquet'` for Parquet-formatted data. | +| \=\ | parquet-specific custom options are described in the [PXF HDFS parquet documentation](hdfs_parquet.html#customopts). | + + ## Example Refer to the [Example](hdfs_parquet.html#parquet_write) in the PXF HDFS Parquet documentation for a Parquet write/read example. Modifications that you must make to run the example with an object store include: -- Using the `CREATE WRITABLE EXTERNAL TABLE` syntax and `LOCATION` keywords and settings described above for the writable external table. For example, if your server name is `s3srvcfg`: +- Using the `CREATE WRITABLE EXTERNAL TABLE` syntax and `LOCATION` keywords and settings described above for the writable and readable external tables. For example, if your server name is `s3srvcfg`: ``` sql CREATE WRITABLE EXTERNAL TABLE pxf_tbl_parquet_s3 (location text, month text, number_of_orders int, item_quantity_per_order int[], total_sales double precision) LOCATION ('pxf://BUCKET/pxf_examples/pxf_parquet?PROFILE=s3:parquet&SERVER=s3srvcfg') FORMAT 'CUSTOM' (FORMATTER='pxfwritable_export'); - ``` - -- Using the `CREATE EXTERNAL TABLE` syntax and `LOCATION` keywords and settings described above for the readable external table. 
For example, if your server name is `s3srvcfg`: - ``` sql CREATE EXTERNAL TABLE read_pxf_parquet_s3(location text, month text, number_of_orders int, item_quantity_per_order int[], total_sales double precision) LOCATION ('pxf://BUCKET/pxf_examples/pxf_parquet?PROFILE=s3:parquet&SERVER=s3srvcfg') FORMAT 'CUSTOM' (FORMATTER='pxfwritable_import'); ``` +- Using the `CREATE FOREIGN TABLE` syntax and settings described above for the foreign table. For example, if your server name is `s3srvcfg`: + ``` sql + CREATE SERVER s3srvcfg FOREIGN DATA WRAPPER s3_pxf_fdw; + CREATE USER MAPPING FOR CURRENT_USER SERVER s3srvcfg; + + CREATE FOREIGN TABLE pxf_parquet_s3 (location text, month text, number_of_orders int, item_quantity_per_order int[], total_sales double precision) + SERVER s3srvcfg + OPTIONS ( + resource 'BUCKET/pxf_examples/pxf_parquet', + format 'parquet' + ) + ``` \ No newline at end of file diff --git a/docs/content/overview_pxf.html.md.erb b/docs/content/overview_pxf.html.md.erb index a1b430958..05517852b 100644 --- a/docs/content/overview_pxf.html.md.erb +++ b/docs/content/overview_pxf.html.md.erb @@ -1,5 +1,5 @@ --- -title: Greenplum Platform Extension Framework (PXF) +title: Apache Cloudberry Platform Extension Framework (PXF) --- + The transition of the PXF project to **Apache Cloudberry (Incubating)** involves a significant rebranding effort. As part of this transition, the Java package namespace has been changed from `org.greenplum` to `org.apache.cloudberry`. This is a user-facing breaking change. If you have customized PXF configuration files in your `$PXF_BASE/conf` directory, you must manually update these files to use the new package names. diff --git a/docs/content/using_pxf.html.md.erb b/docs/content/using_pxf.html.md.erb index 96dea3335..82094676d 100644 --- a/docs/content/using_pxf.html.md.erb +++ b/docs/content/using_pxf.html.md.erb @@ -21,7 +21,7 @@ specific language governing permissions and limitations under the License. 
--> -The Greenplum Platform Extension Framework (PXF) implements a protocol named `pxf` that you can use to create an external table that references data in an external data store. The PXF protocol and Java service are packaged as a Greenplum Database extension. +The Apache Cloudberry Platform Extension Framework (PXF) implements a protocol named `pxf` that you can use to create an external table that references data in an external data store. The PXF protocol and Java service are packaged as a Greenplum Database extension. You must enable the PXF extension in each database in which you plan to use the framework to access external data. You must also explicitly `GRANT` permission to the `pxf` protocol to those users/roles who require access. diff --git a/external-table/Makefile b/external-table/Makefile index 55ba4d924..ae5195dcd 100644 --- a/external-table/Makefile +++ b/external-table/Makefile @@ -18,11 +18,14 @@ include $(PGXS) .PHONY: stage stage: pxf.so mkdir -p build/stage/gpextable + mkdir -p build/metadata install -c -m 755 pxf.so build/stage/gpextable/pxf.so install -c -m 644 pxf.control build/stage/gpextable/ install -c -m 644 $(DATA) build/stage/gpextable/ - @echo "cloudberry.version=$(CLB_VERSION)" > build/stage/gpextable/metadata - @echo "cloudberry.major-version=$(CLB_MAJORVERSION)" >> build/stage/gpextable/metadata + @echo "$(GP_MAJORVERSION)" > build/metadata/gp_major_version + @echo "$(shell uname -m)" > build/metadata/build_arch + @echo "cloudberry.version=$(GP_VERSION)" > build/stage/gpextable/metadata + @echo "cloudberry.major-version=$(GP_MAJORVERSION)" >> build/stage/gpextable/metadata .PHONY: clean-all clean-all: clean diff --git a/package/DEBIAN/conffiles b/package/DEBIAN/conffiles index 622d6eba5..a2f459ca8 100644 --- a/package/DEBIAN/conffiles +++ b/package/DEBIAN/conffiles @@ -1,4 +1,4 @@ -/usr/local/pxf-gp6/conf/pxf-application.properties -/usr/local/pxf-gp6/conf/pxf-env.sh -/usr/local/pxf-gp6/conf/pxf-log4j2.xml 
-/usr/local/pxf-gp6/conf/pxf-profiles.xml +/usr/local/cloudberry-pxf/conf/pxf-application.properties +/usr/local/cloudberry-pxf/conf/pxf-env.sh +/usr/local/cloudberry-pxf/conf/pxf-log4j2.xml +/usr/local/cloudberry-pxf/conf/pxf-profiles.xml diff --git a/package/DEBIAN/control b/package/DEBIAN/control index e3c10ee76..4bb36f8b0 100644 --- a/package/DEBIAN/control +++ b/package/DEBIAN/control @@ -1,5 +1,5 @@ -Package: pxf-gp6 +Package: apache-cloudberry-pxf-incubating Version: %VERSION% -Architecture: amd64 +Architecture: %ARCH% Maintainer: %MAINTAINER% -Description: Greenplum PXF framework for external data access +Description: Apache Cloudberry PXF (Platform Extension Framework) for advanced data access diff --git a/package/DEBIAN/postinst b/package/DEBIAN/postinst index 8d38c996c..35578858e 100755 --- a/package/DEBIAN/postinst +++ b/package/DEBIAN/postinst @@ -1,4 +1,4 @@ #!/bin/sh -sed -i "s|directory =.*|directory = '/usr/local/pxf-gp6/gpextable/'|g" /usr/local/pxf-gp6/gpextable/pxf.control -sed -i "s|module_pathname =.*|module_pathname = '/usr/local/pxf-gp6/gpextable/pxf'|g" /usr/local/pxf-gp6/gpextable/pxf.control \ No newline at end of file +sed -i "s|directory =.*|directory = '/usr/local/cloudberry-pxf/gpextable/'|g" "/usr/local/cloudberry-pxf/gpextable/pxf.control" +sed -i "s|module_pathname =.*|module_pathname = '/usr/local/cloudberry-pxf/gpextable/pxf'|g" "/usr/local/cloudberry-pxf/gpextable/pxf.control" \ No newline at end of file diff --git a/package/DEBIAN/prerm b/package/DEBIAN/prerm index 1ec74420e..1c3d2d085 100755 --- a/package/DEBIAN/prerm +++ b/package/DEBIAN/prerm @@ -1,5 +1,5 @@ #!/bin/sh -rm -f /usr/local/pxf-gp6/conf/pxf-private.classpath -rm -rf /usr/local/pxf-gp6/pxf-service -rm -rf /usr/local/pxf-gp6/run +rm -f /usr/local/cloudberry-pxf/conf/pxf-private.classpath +rm -rf /usr/local/cloudberry-pxf/pxf-service +rm -rf /usr/local/cloudberry-pxf/run diff --git a/package/README.md b/package/README.md index 898ea8b1e..e235b0a19 100644 --- 
a/package/README.md +++ b/package/README.md @@ -1,60 +1,60 @@ PXF Packaging ============ -PXF consists of 3 groups of artifacts, each developed using a different underlying technology: +Apache Cloudberry PXF (Platform Extension Framework) consists of 3 groups of artifacts, each developed using a different underlying technology: -* Greenplum extension -- written in C; when built, produces a `pxf.so` library and configuration files +* Apache Cloudberry extension -- written in C; when built, produces a `pxf.so` library and configuration files * PXF Server -- written in Java; when built, produces a `pxf.war` file, Tomcat server, dependent JAR files, templates and scripts * Script Cluster Plugin -- written in Go; when built, produces a `pxf-cli` executable -The PXF build system can create an RPM package on CentOs platform and a DEB package on Ubuntu platform, -respectively. PXF compiles against and generates a different package for every major Greenplum version. +The PXF build system can create an RPM package on CentOS platform and a DEB package on Ubuntu platform, +respectively. PXF compiles against and generates packages for Apache Cloudberry. -For example, `pxf-gp5-1.2.3-1.el7.x86_64.rpm` represents an RPM package of PXF version 1.2.3 intended to work with -Greenplum 5 on Centos / Redhat 7 operating systems. +For example, `apache-cloudberry-pxf-incubating-1.2.3-1.el7.x86_64.rpm` represents an RPM package of PXF version 1.2.3 intended to work with +Apache Cloudberry on CentOS / Red Hat 7 operating systems. ## PXF RPM specification -On Centos platforms PXF product is packaged as an RPM. The specification on how to build the RPM is provided by the -`pxf-gpX.spec` files in this directory. The following key design decisions were made: +On CentOS platforms PXF product is packaged as an RPM. The specification on how to build the RPM is provided by the +`cloudberry-pxf.spec` file in this directory. 
The following key design decisions were made: -* the name of the RPM package is `pxf-gpX`, where X is the major Greenplum version (e.g. `pxf-gp5`, `pxf-gp6`) -* to install a newer RPM package for the same Greenplum major release, a user will have to upgrade the PXF RPM -* the RPM installs PXF server into `/usr/local/pxf-gpX` directory (e.g. `/usr/local/pxf-gp6`) +* the name of the RPM package is `apache-cloudberry-pxf-incubating` +* to install a newer RPM package, a user will have to upgrade the PXF RPM +* the RPM installs PXF server into `/usr/local/cloudberry-pxf-[VERSION]` directory (e.g. `/usr/local/cloudberry-pxf-1.2.3`) * the RPM is relocatable, a user can specify --prefix option when installing the RPM to install the server into another directory -* the PXF greenplum extension is initially installed by RPM alongside the PXF server and is not initially active -* the PXF greenplum extension is copied into Greenplum install location during `pxf init` command issued by a user after the install +* the PXF Apache Cloudberry extension is initially installed by RPM alongside the PXF server and is not initially active +* the PXF Apache Cloudberry extension is copied into Cloudberry install location during `pxf init` command issued by a user after the install * the PXF RPM version number follows 3-number semantic versioning and must be provided during the RPM build process * the PXF RPM release number is usually specified as `1` -* example PXF RPM names are : `pxf-gp5-1.2.3-1.el6.x86_64.rpm` and `pxf-gp5-1.2.3-1.el7.x86_64.rpm` +* example PXF RPM names are : `apache-cloudberry-pxf-incubating-1.2.3-1.el7.x86_64.rpm` and `apache-cloudberry-pxf-incubating-1.2.3-1.el8.x86_64.rpm` ## PXF RPM build process To build an RPM, follow these steps: 1. Install the `rpm-build` package: `sudo yum install rpm-build` -2. Install Greenplum database -3. Run `source $GPHOME/greenplum_path.sh` to configure your `PATH` to be able to find `pg_config` program +2. 
Install Apache Cloudberry +3. Run `source $GPHOME/greenplum_path.sh` (for Cloudberry 2.0) or `source $GPHOME/cloudberry-env.sh` (for Cloudberry 2.1+) to configure your `PATH` to be able to find `pg_config` program 4. Run `make clean rpm` from the top-level directory to build artifacts and assemble the RPM 5. The RPM will be available in `build/rpmbuild/RPMS` directory ## PXF RPM installation process To install PXF from an RPM, follow these steps: -1. Build or download PXF RPM for the corresponding major version of Greenplum. The following example will assume - that PXF version `1.2.3` will be installed to work with with Greenplum 5. -2. Decide which OS user will own the PXF installation. If PXF is installed alongside Greenplum, the user that owns the PXF -installation should either be the same as the one owning the Greenplum installation or have write privilleges to the -Greenplum installation directory. This is necessary to be able to register the PXF Greenplum extension with Greenplum. +1. Build or download PXF RPM for Apache Cloudberry. The following example will assume + that PXF version `1.2.3` will be installed to work with Apache Cloudberry. +2. Decide which OS user will own the PXF installation. If PXF is installed alongside Apache Cloudberry, the user that owns the PXF +installation should either be the same as the one owning the Cloudberry installation or have write privileges to the +Cloudberry installation directory. This is necessary to be able to register the PXF Apache Cloudberry extension with Cloudberry. 3. If a previous PXF version has been installed, stop the PXF server. -4. As a superuser, run `rpm -Uvh pxf-gp5-1.2.3-1.el7.x86_64.rpm` to install the RPM into `/usr/local/pxf-gp5` -5. As a superuser, run `chown gpadmin:gpadmin /usr/local/pxf-gp5` to change ownership of PXF installation to the user `gpadmin`. +4. 
As a superuser, run `rpm -Uvh apache-cloudberry-pxf-incubating-1.2.3-1.el7.x86_64.rpm` to install the RPM into `/usr/local/cloudberry-pxf-1.2.3` +5. As a superuser, run `chown gpadmin:gpadmin /usr/local/cloudberry-pxf-1.2.3` to change ownership of PXF installation to the user `gpadmin`. Specify a different user other than `gpadmin`, if desired. After these steps, the PXF product will be installed and is ready to be configured. If there was a previous installation of -PXF for the same major Greenplum version, the files and the runtime directories from the older version will be removed. +PXF, the files and the runtime directories from the older version will be removed. The PXF configuration directory should remain intact. You will need to have Java installed to run the PXF server. ## PXF removal process To remove the installed PXF package, follow these steps: 1. Stop the PXF server. -2. As a superuser, run `rpm -e pxf-gp5` (or `rpm -e pxf-gp6`). This will remove all files installed by the RPM package +2. As a superuser, run `rpm -e apache-cloudberry-pxf-incubating`. This will remove all files installed by the RPM package and the PXF runtime directories. The PXF configuration directory should remain intact. 
diff --git a/package/cloudberry-pxf.spec b/package/cloudberry-pxf.spec index 44a144e6e..24fea7810 100644 --- a/package/cloudberry-pxf.spec +++ b/package/cloudberry-pxf.spec @@ -6,7 +6,7 @@ # Disable automatic dependency processing both for requirements and provides AutoReqProv: no -Name: cloudberry-pxf +Name: apache-cloudberry-pxf-incubating Version: %{pxf_version} Release: %{pxf_release}%{?dist} @@ -16,7 +16,7 @@ URL: https://cloudberry.apache.org Vendor: %{vendor} Group: Applications/Databases -Prefix: /usr/local/%{name}-%{version} +Prefix: /usr/local/cloudberry-pxf-%{version} # Java server can be installed on a new node, only bash is needed for # management scripts @@ -27,8 +27,6 @@ Requires: bash # installing on Cloudberry node, so inherit Cloudberry's dependencies # implicitly -Requires: cloudberry-db - # Weak dependencies either OpenJDK 8 or 11 Suggests: java-1.8.0-openjdk Suggests: java-11-openjdk @@ -80,7 +78,7 @@ fi %__cp -R %{_sourcedir}/* %{buildroot}/%{prefix} # Create symlink -%__ln_s %{prefix} %{buildroot}/usr/local/%{name} +%__ln_s %{prefix} %{buildroot}/usr/local/cloudberry-pxf %post sed -i "s|directory =.*|directory = '${RPM_INSTALL_PREFIX}/gpextable/'|g" "${RPM_INSTALL_PREFIX}/gpextable/pxf.control" @@ -93,7 +91,7 @@ fi %files %{prefix} -/usr/local/%{name} +/usr/local/cloudberry-pxf # If a file is not marked as a config file, or if a file has not been altered # since installation, then it will be silently replaced by the version from the @@ -131,5 +129,5 @@ fi %preun # Remove symlink on uninstall if [ $1 -eq 0 ] ; then - %__rm -f /usr/local/%{name} + %__rm -f /usr/local/cloudberry-pxf fi diff --git a/package/install_binary b/package/install_binary index 574a851b6..fbed1c71d 100755 --- a/package/install_binary +++ b/package/install_binary @@ -4,9 +4,9 @@ INSTALL_COMPONENT_SCRIPT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) PXF_VERSION=$(<"${INSTALL_COMPONENT_SCRIPT_DIR}/pxf/version") function show_help() { - echo "This script installs PXF 
Greenplum Extension and PXF Server" + echo "This script installs PXF Extension and PXF Server" echo "to install the PXF Extension :" - echo " set GPHOME environment variable to the installation directory of Greenplum Database" + echo " set GPHOME environment variable to the installation directory of Apache Cloudberry" echo "to install the PXF Server :" echo " either set PXF_HOME environment variable to the target PXF Server installation directory" echo " or set GPHOME environment variable to have PXF Server installed into \${GPHOME}/pxf" @@ -29,8 +29,8 @@ function check_gphome() { exit 1 fi else - if [[ ! -f ${GPHOME}/greenplum_path.sh ]]; then - echo "Error: environment variable GPHOME (${GPHOME}) must be set to a valid Greenplum installation" + if [[ ! -f ${GPHOME}/greenplum_path.sh ]] && [[ ! -f ${GPHOME}/cloudberry-env.sh ]]; then + echo "Error: environment variable GPHOME (${GPHOME}) must be set to a valid Apache Cloudberry installation" exit 1 fi if [[ -z "${PXF_HOME}" ]]; then @@ -58,11 +58,11 @@ function install_new_component() { echo "Installing PXF version ${PXF_VERSION} ..." if [[ ! -z "${GPHOME}" ]]; then - echo "... installing PXF Greenplum Extension into ${GPHOME} ..." + echo "... installing PXF Extension into ${GPHOME} ..." cp -av ${INSTALL_COMPONENT_SCRIPT_DIR}/{lib,share} ${GPHOME} check_status $? else - echo "... skipping PXF Greenplum Extension as GPHOME environment variable is not set ..." + echo "... skipping PXF Extension as GPHOME environment variable is not set ..." fi if [[ ! 
-z "${PXF_HOME}" ]]; then diff --git a/package/install_deb b/package/install_deb index de7db3880..9450b65c5 100755 --- a/package/install_deb +++ b/package/install_deb @@ -14,6 +14,6 @@ if [[ "${owner}" != "root" ]]; then fi ${sudo_if_needed} dpkg --install ${INSTALL_COMPONENT_SCRIPT_DIR}/${DEB_FILE} -${sudo_if_needed} chown --recursive ${owner} /usr/local/pxf-gp* +${sudo_if_needed} chown --recursive ${owner} /usr/local/cloudberry-pxf* -echo "Successfully installed PXF version $(cat /usr/local/pxf-gp*/version)" +echo "Successfully installed PXF version $(cat /usr/local/cloudberry-pxf*/version)" diff --git a/package/install_rpm b/package/install_rpm index 2c28767dc..d34d0e087 100755 --- a/package/install_rpm +++ b/package/install_rpm @@ -14,6 +14,6 @@ if [[ "${owner}" != "root" ]]; then fi ${sudo_if_needed} rpm -Uvh ${INSTALL_COMPONENT_SCRIPT_DIR}/${RPM_FILE} -${sudo_if_needed} chown -R ${owner} /usr/local/pxf-gp* +${sudo_if_needed} chown -R ${owner} /usr/local/cloudberry-pxf* -echo "Successfully installed PXF version $(cat /usr/local/pxf-gp*/version)" +echo "Successfully installed PXF version $(cat /usr/local/cloudberry-pxf*/version)" diff --git a/package/pxf-cbdb1.spec b/package/pxf-cbdb1.spec deleted file mode 100644 index f2812e34e..000000000 --- a/package/pxf-cbdb1.spec +++ /dev/null @@ -1,79 +0,0 @@ -# Disable repacking of jars, since it takes forever -%define __jar_repack %{nil} - -# Disable build-id in rpm -%define _build_id_links none -# Disable automatic dependency processing both for requirements and provides -AutoReqProv: no - -Name: pxf-cbdb1 -Version: %{pxf_version} -Release: 1%{?dist} -Summary: Cloudberry PXF framework for external data access -License: %{license} -URL: http://www.hashdata.cn -Vendor: %{vendor} - -Prefix: /usr/local/%{name} - -# .so file makes sense only when installing on Cloudberry node, so inherit Cloudberry's dependencies implicitly -# Java server can be installed on a new node, only bash is needed for management scripts -## cbdb 
has added this requirement, pxf may installed under GPHOME, -# this requirement will cause installation fail. -# Requires: bash - -%description -PXF is an extensible framework that allows a distributed database like Cloudberry to query external data files, -whose metadata is not managed by the database. PXF includes built-in connectors for accessing data that exists -inside HDFS files, Hive tables, HBase tables, databases that support JDBC, data stores (S3, GCS) and more. - -%prep -# If the pxf_version macro is not defined, it gets interpreted as a literal string, need %% to escape it -if [ %{pxf_version} = '%%{pxf_version}' ] ; then - echo "The macro (variable) pxf_version must be supplied as rpmbuild ... --define='pxf_version [VERSION]'" - exit 1 -fi - -%install -%__mkdir -p %{buildroot}/%{prefix} -%__cp -R %{_sourcedir}/* %{buildroot}/%{prefix} - -%post -sed -i "s|directory =.*|directory = '${RPM_INSTALL_PREFIX}/fdw/'|g" "${RPM_INSTALL_PREFIX}/fdw/pxf_fdw.control" -sed -i "s|module_pathname =.*|module_pathname = '${RPM_INSTALL_PREFIX}/fdw/pxf_fdw'|g" "${RPM_INSTALL_PREFIX}/fdw/pxf_fdw.control" - -%files -%{prefix} - -# If a file is not marked as a config file, or if a file has not been altered -# since installation, then it will be silently replaced by the version from the -# RPM. - -# If a config file has been edited on disk, but is not actually different from -# the file in the RPM then the edited version will be silently left in place. - -# When a config file has been edited and is different from the file in -# the RPM, then the behavior is the following: -# - %config(noreplace): The edited version will be left in place, and the new -# version will be installed with an .rpmnew suffix. -# - %config: The new file will be installed, and the the old edited version -# will be renamed with an .rpmsave suffix. 
- -# Configuration directories/files -%config(noreplace) %{prefix}/conf/pxf-application.properties -%config(noreplace) %{prefix}/conf/pxf-env.sh -%config(noreplace) %{prefix}/conf/pxf-log4j2.xml -%config(noreplace) %{prefix}/conf/pxf-profiles.xml - -%pre -# cleanup files and directories created by 'pxf init' command -# only applies for old installations (pre 6.0.0) -%__rm -f "${RPM_INSTALL_PREFIX}/conf/pxf-private.classpath" -%__rm -rf "${RPM_INSTALL_PREFIX}/pxf-service" - -%posttrans -# PXF v5 RPM installation removes the run directory during the %preun step. -# The lack of run directory prevents PXF v6+ from starting up. -# %posttrans of the new package is the only step that runs after the %preun -# of the old package -%{__install} -d -m 700 "${RPM_INSTALL_PREFIX}/run" diff --git a/package/pxf-gp7.spec b/package/pxf-gp7.spec deleted file mode 100644 index 4b22e786a..000000000 --- a/package/pxf-gp7.spec +++ /dev/null @@ -1,79 +0,0 @@ -# Disable repacking of jars, since it takes forever -%define __jar_repack %{nil} - -# Disable automatic dependency processing both for requirements and provides -AutoReqProv: no - -Name: pxf-gp7 -Version: %{pxf_version} -Release: %{pxf_release}%{?dist} -Summary: Greenplum PXF framework for external data access -License: %{license} -URL: https://github.com/greenplum-db/pxf -Vendor: %{vendor} - -Prefix: /usr/local/%{name} - -# .so file makes sense only when installing on Greenplum node, so inherit Greenplum's dependencies implicitly -# Java server can be installed on a new node, only bash is needed for management scripts -Requires: bash - -%description -PXF is an extensible framework that allows a distributed database like Greenplum to query external data files, -whose metadata is not managed by the database. PXF includes built-in connectors for accessing data that exists -inside HDFS files, Hive tables, HBase tables, databases that support JDBC, data stores (S3, GCS) and more. 
- -%prep -# If the pxf_version macro is not defined, it gets interpreted as a literal string, need %% to escape it -if [ %{pxf_version} = '%%{pxf_version}' ] ; then - echo "The macro (variable) pxf_version must be supplied as rpmbuild ... --define='pxf_version [VERSION]'" - exit 1 -fi - -%install -%__mkdir -p %{buildroot}/%{prefix} -%__cp -R %{_sourcedir}/* %{buildroot}/%{prefix} - -%post -sed -i "s|directory =.*|directory = '${RPM_INSTALL_PREFIX}/gpextable/'|g" "${RPM_INSTALL_PREFIX}/gpextable/pxf.control" -sed -i "s|module_pathname =.*|module_pathname = '${RPM_INSTALL_PREFIX}/gpextable/pxf'|g" "${RPM_INSTALL_PREFIX}/gpextable/pxf.control" -sed -i "s|directory =.*|directory = '${RPM_INSTALL_PREFIX}/fdw/'|g" "${RPM_INSTALL_PREFIX}/fdw/pxf_fdw.control" -sed -i "s|module_pathname =.*|module_pathname = '${RPM_INSTALL_PREFIX}/fdw/pxf_fdw'|g" "${RPM_INSTALL_PREFIX}/fdw/pxf_fdw.control" -sed -i "s|directory =.*|directory = '${RPM_INSTALL_PREFIX}/gpextable/'|g" "${RPM_INSTALL_PREFIX}/gpextable/pxf.control" -sed -i "s|module_pathname =.*|module_pathname = '${RPM_INSTALL_PREFIX}/gpextable/pxf'|g" "${RPM_INSTALL_PREFIX}/gpextable/pxf.control" - -%files -%{prefix} - -# If a file is not marked as a config file, or if a file has not been altered -# since installation, then it will be silently replaced by the version from the -# RPM. - -# If a config file has been edited on disk, but is not actually different from -# the file in the RPM then the edited version will be silently left in place. - -# When a config file has been edited and is different from the file in -# the RPM, then the behavior is the following: -# - %config(noreplace): The edited version will be left in place, and the new -# version will be installed with an .rpmnew suffix. -# - %config: The new file will be installed, and the the old edited version -# will be renamed with an .rpmsave suffix. 
- -# Configuration directories/files -%config(noreplace) %{prefix}/conf/pxf-application.properties -%config(noreplace) %{prefix}/conf/pxf-env.sh -%config(noreplace) %{prefix}/conf/pxf-log4j2.xml -%config(noreplace) %{prefix}/conf/pxf-profiles.xml - -%pre -# cleanup files and directories created by 'pxf init' command -# only applies for old installations (pre 6.0.0) -%__rm -f "${RPM_INSTALL_PREFIX}/conf/pxf-private.classpath" -%__rm -rf "${RPM_INSTALL_PREFIX}/pxf-service" - -%posttrans -# PXF v5 RPM installation removes the run directory during the %preun step. -# The lack of run directory prevents PXF v6+ from starting up. -# %posttrans of the new package is the only step that runs after the %preun -# of the old package -%{__install} -d -m 700 "${RPM_INSTALL_PREFIX}/run" diff --git a/regression/README.md b/regression/README.md index e9ce59044..44e7ab8b3 100644 --- a/regression/README.md +++ b/regression/README.md @@ -14,7 +14,7 @@ Running the tests ## Pre-requisites You need a running instance of Greenplum and PXF, along with a local installation of Greenplum (to be able to use the `pg_regress` framework). -The variables `PGHOST` and `PGPORT` must be pointing at the Greenplum master node, and Greenplum environment scripts like `${GPHOME}/greenplum_path.sh` and `gpdb/gpAux/gpdemo/gpdemo-env.sh` should be sourced. +The variables `PGHOST` and `PGPORT` must be pointing at the Greenplum master node, and Greenplum environment scripts like `${GPHOME}/greenplum_path.sh` (for Cloudberry 2.0) or `${GPHOME}/cloudberry-env.sh` (for Cloudberry 2.1+) should be sourced. `pg_config` must be on your path. For data prep, the appropriate CLIs are required, as we shell out from SQL to these CLIs. These include `hdfs`, `hbase`, and `beeline`. @@ -46,8 +46,7 @@ By setting environment variables you can change the location of the Greenplum ma ### General environment variables -All the general environment variables that come from `greenplum_path.sh` and -`gpdemo-env.sh` must be set. 
Additionally, `PXF_BASE` must be set if different +All the general environment variables that come from `greenplum_path.sh` (for Cloudberry 2.0) or `cloudberry-env.sh` (for Cloudberry 2.1+) must be set. Additionally, `PXF_BASE` must be set if different from `PXF_HOME`. * `PXF_TEST_DEBUG`: set to anything to prevent deletion of data, and to run `pg_regress` in debug mode (optional) diff --git a/server/build.gradle b/server/build.gradle index a1b6275c2..3eb6b087c 100644 --- a/server/build.gradle +++ b/server/build.gradle @@ -90,15 +90,17 @@ configure(javaProjects) { dependency("commons-configuration:commons-configuration:1.10") dependency("commons-io:commons-io:2.7") dependency("commons-lang:commons-lang:2.6") + dependency("commons-lang:commons-lang3:3.9") dependency("commons-logging:commons-logging:1.1.3") - dependency("io.airlift:aircompressor:0.27") + dependency("io.airlift:aircompressor:2.0.2") dependency("javax.jdo:jdo-api:3.0.1") dependency("joda-time:joda-time:2.8.1") dependency("net.sf.opencsv:opencsv:2.3") dependency("org.antlr:antlr-runtime:3.5.2") dependency("org.apache.commons:commons-compress:1.20") + dependency("org.apache.commons:commons-crypto:1.0.0") dependency("org.apache.htrace:htrace-core:3.1.0-incubating") - dependency("org.apache.htrace:htrace-core4:4.0.1-incubating") + dependency("org.apache.htrace:htrace-core4:4.2.0-incubating") dependency("org.apache.zookeeper:zookeeper:3.4.6") dependency("org.codehaus.woodstox:stax2-api:3.1.4") @@ -120,7 +122,7 @@ configure(javaProjects) { dependency("org.threeten:threeten-extra:1.5.0") dependency("org.tukaani:xz:1.8") dependency("org.wildfly.openssl:wildfly-openssl:1.0.7.Final") - dependency("org.xerial.snappy:snappy-java:1.1.10.4") + dependency("org.xerial.snappy:snappy-java:1.1.10.7") // Hadoop dependencies dependencySet(group:"org.apache.hadoop", version:"${hadoopVersion}") { @@ -139,11 +141,28 @@ configure(javaProjects) { // HBase dependencies dependencySet(group:"org.apache.hbase", 
version:"${hbaseVersion}") { - entry("hbase-annotations") entry("hbase-client") entry("hbase-common") entry("hbase-protocol") + entry("hbase-protocol-shaded") + entry("hbase-logging") + entry("hbase-hadoop-compat") + entry("hbase-hadoop2-compat") + entry("hbase-metrics-api") + entry("hbase-metrics") } + dependencySet(group:"org.apache.hbase.thirdparty", version:"3.3.0") { + entry("hbase-shaded-protobuf") + entry("hbase-shaded-miscellaneous") + entry("hbase-shaded-gson") + entry("hbase-shaded-netty") + entry("hbase-unsafe") + } + dependency("org.apache.yetus:audience-annotations:0.5.0") + dependency("io.opentelemetry:opentelemetry-api:1.49.0") + dependency("io.opentelemetry:opentelemetry-context:1.49.0") + dependency("io.opentelemetry.semconv:opentelemetry-semconv:1.29.0-alpha") + dependency("io.dropwizard.metrics:metrics-core:3.2.6") // Hive dependencies dependency("org.apache.hive:hive-storage-api:${hiveStorageApiVersion}") @@ -193,7 +212,7 @@ configure(javaProjects) { entry("avro") entry("avro-mapred") } - // Zstd support for Avro + // Zstd support for Avro/Parquet dependency("com.github.luben:zstd-jni:1.5.7-6") // Jackson 1.x dependencies @@ -237,7 +256,7 @@ configure(javaProjects) { options.compilerArgs += [ "-g", "-Xlint:varargs", "-Xlint:cast", "-Xlint:classfile", "-Xlint:dep-ann", "-Xlint:divzero", "-Xlint:empty", "-Xlint:finally", "-Xlint:overrides", "-Xlint:path", "-Xlint:-processing", "-Xlint:static", - "-Xlint:try", "-Xlint:fallthrough", "-Xlint:deprecation", "-Xlint:unchecked", "-Xlint:-options", "-Werror" + "-Xlint:try", "-Xlint:fallthrough", "-Xlint:unchecked", "-Xlint:-options", "-Werror" ] } @@ -245,7 +264,7 @@ configure(javaProjects) { options.compilerArgs += [ "-g", "-Xlint:varargs", "-Xlint:cast", "-Xlint:classfile", "-Xlint:dep-ann", "-Xlint:divzero", "-Xlint:empty", "-Xlint:finally", "-Xlint:overrides", "-Xlint:path", "-Xlint:-processing", "-Xlint:static", - "-Xlint:try", "-Xlint:fallthrough", "-Xlint:deprecation", "-Xlint:unchecked", 
"-Xlint:-options", "-Werror" + "-Xlint:try", "-Xlint:fallthrough", "-Xlint:unchecked", "-Xlint:-options", "-Werror" ] } diff --git a/server/gradle.properties b/server/gradle.properties index 42da880a3..e0f416347 100644 --- a/server/gradle.properties +++ b/server/gradle.properties @@ -21,9 +21,9 @@ license=ASL 2.0 hadoopVersion=2.10.2 hiveVersion=2.3.8 hiveStorageApiVersion=2.7.3 -hbaseVersion=1.3.2 +hbaseVersion=2.3.7 junitVersion=4.11 -parquetVersion=1.12.3 +parquetVersion=1.15.2 awsJavaSdk=1.12.261 springBootVersion=2.7.18 org.gradle.daemon=true diff --git a/server/pxf-hbase/build.gradle b/server/pxf-hbase/build.gradle index 026d602b9..5924b8f22 100644 --- a/server/pxf-hbase/build.gradle +++ b/server/pxf-hbase/build.gradle @@ -14,23 +14,47 @@ dependencies { *******************************/ compileOnly("com.google.code.findbugs:annotations") - compileOnly("org.apache.hbase:hbase-annotations") /******************************* - * Implementation Dependencies + * Project Dependencies *******************************/ implementation(project(':pxf-api')) - implementation("com.google.protobuf:protobuf-java") implementation("commons-collections:commons-collections") - implementation("org.apache.hbase:hbase-client") { transitive = false } - implementation("org.apache.hbase:hbase-common") { transitive = false } - implementation("org.apache.hbase:hbase-protocol") { transitive = false } - implementation("org.apache.htrace:htrace-core") { transitive = false } - implementation("org.apache.zookeeper:zookeeper") { transitive = false } - implementation("io.netty:netty-common") { transitive = false } - implementation("io.netty:netty-transport") { transitive = false } - implementation("com.yammer.metrics:metrics-core") { transitive = false } + + /******************************* + * Hbase + *******************************/ + + implementation("org.apache.hbase:hbase-client") { transitive = false } + implementation("org.apache.hbase.thirdparty:hbase-shaded-protobuf") { transitive = 
false } + implementation("org.apache.hbase:hbase-common") { transitive = false } + implementation("org.apache.hbase:hbase-logging") { transitive = false } + implementation("org.apache.hbase.thirdparty:hbase-shaded-miscellaneous") { transitive = false } + implementation("org.apache.hbase.thirdparty:hbase-shaded-gson") { transitive = false } + implementation("org.apache.hbase.thirdparty:hbase-shaded-netty") { transitive = false } + implementation("org.apache.commons:commons-lang3") { transitive = false } + implementation("org.apache.commons:commons-crypto") { transitive = false } + implementation("org.apache.hadoop:hadoop-common") { transitive = false } + implementation("org.apache.hadoop:hadoop-auth") { transitive = false } + implementation("org.apache.hbase:hbase-hadoop-compat") { transitive = false } + implementation("org.apache.hbase:hbase-metrics-api") { transitive = false } + implementation("org.apache.hbase:hbase-metrics") { transitive = false } + implementation("org.apache.hbase:hbase-hadoop2-compat") { transitive = false } + implementation("org.apache.hbase:hbase-protocol-shaded") { transitive = false } + implementation("org.apache.hbase:hbase-protocol") { transitive = false } + implementation("com.google.protobuf:protobuf-java") { transitive = false } + implementation("org.apache.zookeeper:zookeeper") { transitive = false } + implementation("io.netty:netty-common") { transitive = false } + implementation("io.netty:netty-transport") { transitive = false } +// skip JRuby - it is part of interactive shell +// implementation("org.jruby.jcodings:jcodings:1.0.58") { transitive = false } +// implementation("org.jruby.joni:joni:2.2.1") { transitive = false } + implementation("org.apache.yetus:audience-annotations") { transitive = false } + implementation("io.opentelemetry:opentelemetry-api") { transitive = false } + implementation("io.opentelemetry:opentelemetry-context") { transitive = false } + implementation("io.opentelemetry.semconv:opentelemetry-semconv") { 
transitive = false } + implementation("io.dropwizard.metrics:metrics-core:3.2.6") { transitive = false } implementation("org.springframework.boot:spring-boot-starter-log4j2") @@ -39,7 +63,6 @@ dependencies { *******************************/ testCompileOnly("com.google.code.findbugs:annotations") - testCompileOnly("org.apache.hbase:hbase-annotations") testImplementation("com.esotericsoftware:minlog") testImplementation("com.esotericsoftware:reflectasm") testImplementation('org.springframework.boot:spring-boot-starter-test') diff --git a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/HBaseDataFragmenter.java b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/HBaseDataFragmenter.java index 21d5d42d7..774fb9c22 100644 --- a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/HBaseDataFragmenter.java +++ b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/HBaseDataFragmenter.java @@ -81,7 +81,6 @@ public FragmentStats getFragmentStats() { public List getFragments() throws Exception { // check that Zookeeper and HBase master are available - HBaseAdmin.checkHBaseAvailable(configuration); connection = ConnectionFactory.createConnection(configuration); Admin hbaseAdmin = connection.getAdmin(); if (!HBaseUtilities.isTableAvailable(hbaseAdmin, context.getDataSource())) { diff --git a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseDoubleComparator.java b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseDoubleComparator.java index 3627ca972..6dedaf850 100644 --- a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseDoubleComparator.java +++ b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseDoubleComparator.java @@ -19,11 +19,11 @@ * under the License. 
*/ -import com.google.protobuf.ByteString; -import com.google.protobuf.InvalidProtocolBufferException; +import org.apache.hbase.thirdparty.com.google.protobuf.ByteString; +import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException; import org.apache.hadoop.hbase.exceptions.DeserializationException; import org.apache.hadoop.hbase.filter.ByteArrayComparable; -import org.apache.hadoop.hbase.protobuf.generated.ComparatorProtos; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos; import org.apache.hadoop.hbase.util.Bytes; public class HBaseDoubleComparator extends ByteArrayComparable { diff --git a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseFloatComparator.java b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseFloatComparator.java index bf26b0033..a5291de62 100644 --- a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseFloatComparator.java +++ b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseFloatComparator.java @@ -19,11 +19,11 @@ * under the License. 
*/ -import com.google.protobuf.ByteString; -import com.google.protobuf.InvalidProtocolBufferException; +import org.apache.hbase.thirdparty.com.google.protobuf.ByteString; +import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException; import org.apache.hadoop.hbase.exceptions.DeserializationException; import org.apache.hadoop.hbase.filter.ByteArrayComparable; -import org.apache.hadoop.hbase.protobuf.generated.ComparatorProtos; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos; import org.apache.hadoop.hbase.util.Bytes; public class HBaseFloatComparator extends ByteArrayComparable{ diff --git a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseIntegerComparator.java b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseIntegerComparator.java index ffc0a10eb..a1e589c52 100644 --- a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseIntegerComparator.java +++ b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseIntegerComparator.java @@ -23,11 +23,11 @@ import org.apache.hadoop.hbase.exceptions.DeserializationException; import org.apache.hadoop.hbase.filter.ByteArrayComparable; import org.apache.hadoop.hbase.filter.SubstringComparator; -import org.apache.hadoop.hbase.protobuf.generated.ComparatorProtos; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos; import org.apache.hadoop.hbase.util.Bytes; -import com.google.protobuf.ByteString; -import com.google.protobuf.InvalidProtocolBufferException; +import org.apache.hbase.thirdparty.com.google.protobuf.ByteString; +import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException; /** * This is a Filter comparator for HBase It is external to PXF HBase code. 
diff --git a/server/pxf-service/src/scripts/pxf b/server/pxf-service/src/scripts/pxf index 0a4229833..5e3f09054 100755 --- a/server/pxf-service/src/scripts/pxf +++ b/server/pxf-service/src/scripts/pxf @@ -217,7 +217,7 @@ function doHelp() { restart restart the local PXF server instance (not supported for cluster) status show the status of the local PXF server instance version show the version of PXF server - register install PXF extension under \$GPHOME (useful after upgrades of Greenplum server) + register install PXF extension under \$GPHOME (useful after upgrades of Cloudberry server) prepare prepares a new base directory specified by the \$PXF_BASE environment variable. It creates the servers, logs, lib, keytabs, and run directories inside \$PXF_BASE and copies configuration files. @@ -253,11 +253,11 @@ function doReset() { function installExtensions() { if [[ -d ${parent_script_dir}/gpextable ]]; then if [[ -z "${GPHOME}" ]]; then - echoYellow 'WARNING: environment variable GPHOME is not set, skipping install of Greenplum External Table PXF Extension' - elif [[ ! -f ${GPHOME}/greenplum_path.sh ]]; then - echoYellow "WARNING: environment variable GPHOME (${GPHOME}) must be set to a valid Greenplum installation, skipping install of Greenplum External Table PXF Extension'" + echoYellow 'WARNING: environment variable GPHOME is not set, skipping install of Cloudberry External Table PXF Extension' + elif [[ ! -f ${GPHOME}/greenplum_path.sh && ! 
-f ${GPHOME}/cloudberry-env.sh ]]; then + echoYellow "WARNING: environment variable GPHOME (${GPHOME}) must be set to a valid Cloudberry installation, skipping install of Cloudberry External Table PXF Extension'" else - echoGreen "Installing Greenplum External Table PXF Extension into '${GPHOME}'" + echoGreen "Installing Cloudberry External Table PXF Extension into '${GPHOME}'" local target_control_file="${GPHOME}/share/postgresql/extension/pxf.control" install --verbose --mode=0644 "${parent_script_dir}/gpextable/pxf.control" "${target_control_file}" || fail "cannot install pxf.control to '${target_control_file}'" @@ -265,11 +265,11 @@ function installExtensions() { fi if [[ -d ${parent_script_dir}/fdw ]]; then if [[ -z "${GPHOME}" ]]; then - echoYellow 'WARNING: environment variable GPHOME is not set, skipping install of Greenplum Foreign Data Wrapper PXF Extension' - elif [[ ! -f ${GPHOME}/greenplum_path.sh ]]; then - echoYellow "WARNING: environment variable GPHOME (${GPHOME}) must be set to a valid Greenplum installation, skipping install of Greenplum Foreign Data Wrapper PXF Extension'" + echoYellow 'WARNING: environment variable GPHOME is not set, skipping install of Cloudberry Foreign Data Wrapper PXF Extension' + elif [[ ! -f ${GPHOME}/greenplum_path.sh && ! 
-f ${GPHOME}/cloudberry-env.sh ]]; then + echoYellow "WARNING: environment variable GPHOME (${GPHOME}) must be set to a valid Cloudberry installation, skipping install of Cloudberry Foreign Data Wrapper PXF Extension'" else - echoGreen "Installing Greenplum Foreign Data Wrapper PXF Extension into '${GPHOME}'" + echoGreen "Installing Cloudberry Foreign Data Wrapper PXF Extension into '${GPHOME}'" local target_control_file="${GPHOME}/share/postgresql/extension/pxf_fdw.control" install --verbose --mode=0644 "${parent_script_dir}/fdw/pxf_fdw.control" "${target_control_file}" || fail "cannot install pxf_fdw.control to '${target_control_file}'" diff --git a/server/pxf-service/src/scripts/pxf-post-gpupgrade b/server/pxf-service/src/scripts/pxf-post-gpupgrade index 5f017da43..59aa3ad1e 100755 --- a/server/pxf-service/src/scripts/pxf-post-gpupgrade +++ b/server/pxf-service/src/scripts/pxf-post-gpupgrade @@ -56,24 +56,24 @@ EOF metadata_file="${PXF_HOME}/gpextable/metadata" pxf_gpdb_major_version="" if [[ -f "${metadata_file}" ]]; then - pxf_gpdb_major_version="$(awk 'BEGIN { FS = \"=\" } /cloudberry.major-version/{ print $2 }' \"${metadata_file}\")" + pxf_gpdb_major_version="$(awk 'BEGIN { FS = "=" } /cloudberry.major-version/{ print $2 }' "${metadata_file}")" else echo "WARNING: metadata file '${metadata_file}' not found; skipping PXF/GPDB compatibility check" >>"${log_file}" fi gp_version="$(psql --no-align --tuples-only --command "SHOW server_version")" pxf_version="$(cat "${PXF_HOME}"/version)" -echo "PXF ${pxf_version} compiled against GPDB major version '${pxf_gpdb_major_version}'" >>"${log_file}" -echo "Running GPDB cluster is version '${gp_version}'" >>"${log_file}" +echo "PXF ${pxf_version} compiled against Cloudberry major version '${pxf_gpdb_major_version}'" >>"${log_file}" +echo "Running Cloudberry cluster is version '${gp_version}'" >>"${log_file}" if [[ -n "${pxf_gpdb_major_version}" && "${pxf_gpdb_major_version}" != "${gp_version%%.*}" ]]; then - echo "ERROR: 
This version of PXF only works with GPDB ${pxf_gpdb_major_version}+ but the targeted GPDB cluster is ${gp_version}" | tee -a "${log_file}" + echo "ERROR: This version of PXF only works with Cloudberry ${pxf_gpdb_major_version}+ but the targeted Cloudberry cluster is ${gp_version}" | tee -a "${log_file}" exit 1 fi -master_data_dir_query="SELECT datadir FROM pg_catalog.gp_segment_configuration WHERE dbid = 1" -export MASTER_DATA_DIRECTORY="${MASTER_DATA_DIRECTORY:-$(psql --no-align --tuples-only --command "${master_data_dir_query}")}" -echo "GPDB master data directory is '${MASTER_DATA_DIRECTORY}'" >>"${log_file}" +coordinator_data_dir_query="SELECT datadir FROM pg_catalog.gp_segment_configuration WHERE dbid = 1" +export COORDINATOR_DATA_DIRECTORY="${COORDINATOR_DATA_DIRECTORY:-$(psql --no-align --tuples-only --command "${coordinator_data_dir_query}")}" +echo "Cloudberry coordinator data directory is '${COORDINATOR_DATA_DIRECTORY}'" >>"${log_file}" if [[ -d "${PXF_HOME}/gpextable" ]]; then PXF_HOME_REGEX="(.*:)*\/gpextable.*" diff --git a/server/pxf-service/src/scripts/pxf-pre-gpupgrade b/server/pxf-service/src/scripts/pxf-pre-gpupgrade index 1306aa400..c7a69535e 100755 --- a/server/pxf-service/src/scripts/pxf-pre-gpupgrade +++ b/server/pxf-service/src/scripts/pxf-pre-gpupgrade @@ -56,24 +56,24 @@ EOF metadata_file="${PXF_HOME}/gpextable/metadata" pxf_gpdb_major_version="" if [[ -f "${metadata_file}" ]]; then - pxf_gpdb_major_version="$(awk 'BEGIN { FS = \"=\" } /cloudberry.major-version/{ print $2 }' \"${metadata_file}\")" + pxf_gpdb_major_version="$(awk 'BEGIN { FS = "=" } /cloudberry.major-version/{ print $2 }' "${metadata_file}")" else - echo "WARNING: metadata file '${metadata_file}' not found; skipping PXF/GPDB compatibility check" >>"${log_file}" + echo "WARNING: metadata file '${metadata_file}' not found; skipping PXF/Cloudberry compatibility check" >>"${log_file}" fi gp_version="$(psql --no-align --tuples-only --command "SHOW server_version")" 
pxf_version="$(cat "${PXF_HOME}"/version)" -echo "PXF ${pxf_version} compiled against GPDB major version '${pxf_gpdb_major_version}'" >>"${log_file}" -echo "Running GPDB cluster is version '${gp_version}'" >>"${log_file}" +echo "PXF ${pxf_version} compiled against Cloudberry major version '${pxf_gpdb_major_version}'" >>"${log_file}" +echo "Running Cloudberry cluster is version '${gp_version}'" >>"${log_file}" if [[ -n "${pxf_gpdb_major_version}" && "${pxf_gpdb_major_version}" != "${gp_version%%.*}" ]]; then - echo "ERROR: This version of PXF only works with GPDB ${pxf_gpdb_major_version}+ but the targeted GPDB cluster is ${gp_version}" | tee -a "${log_file}" + echo "ERROR: This version of PXF only works with Cloudberry ${pxf_gpdb_major_version}+ but the targeted Cloudberry cluster is ${gp_version}" | tee -a "${log_file}" exit 1 fi -master_data_dir_query="SELECT datadir FROM pg_catalog.gp_segment_configuration WHERE dbid = 1" -export MASTER_DATA_DIRECTORY="${MASTER_DATA_DIRECTORY:-$(psql --no-align --tuples-only --command "${master_data_dir_query}")}" -echo "GPDB master data directory is '${MASTER_DATA_DIRECTORY}'" >>"${log_file}" +coordinator_data_dir_query="SELECT datadir FROM pg_catalog.gp_segment_configuration WHERE dbid = 1" +export COORDINATOR_DATA_DIRECTORY="${COORDINATOR_DATA_DIRECTORY:-$(psql --no-align --tuples-only --command "${coordinator_data_dir_query}")}" +echo "Cloudberry coordinator data directory is '${COORDINATOR_DATA_DIRECTORY}'" >>"${log_file}" if [[ -d "${PXF_HOME}/gpextable" ]]; then PXF_HOME_REGEX="(.*:)*\/gpextable.*"