From 237d17261f79ade922346f674d3ce756ca71fe09 Mon Sep 17 00:00:00 2001 From: Nikolay Antonov Date: Fri, 6 Feb 2026 08:26:56 +0500 Subject: [PATCH 1/2] Update parquet to 1.12.3 in PXF Update parquet to 1.12.3 (latest version with hadoop-client-2.x) * Add support for ZSTD compression * Temporary declare LZO as not supported. It causes following error in both current and in pre-upgrade builds: ``` ERROR: PXF server error : Class com.hadoop.compression.lzo.LzoCodec was not found (seg1 10.11.0.131:6000 pid=2567556) ``` * Add tests to cover different types of compression --- .../features/parquet/ParquetWriteTest.java | 15 +++++++++++++++ docs/content/hdfs_parquet.md | 4 ++-- server/build.gradle | 3 ++- server/gradle.properties | 2 +- server/pxf-hdfs/build.gradle | 1 + 5 files changed, 21 insertions(+), 4 deletions(-) diff --git a/automation/src/test/java/org/greenplum/pxf/automation/features/parquet/ParquetWriteTest.java b/automation/src/test/java/org/greenplum/pxf/automation/features/parquet/ParquetWriteTest.java index 8bd235b66..0a12b1904 100644 --- a/automation/src/test/java/org/greenplum/pxf/automation/features/parquet/ParquetWriteTest.java +++ b/automation/src/test/java/org/greenplum/pxf/automation/features/parquet/ParquetWriteTest.java @@ -195,6 +195,21 @@ public void parquetWritePrimitivesGZipClassName() throws Exception { runWritePrimitivesScenario("pxf_parquet_write_primitives_gzip_classname", "pxf_parquet_read_primitives_gzip_classname", "parquet_write_primitives_gzip_classname", new String[]{"COMPRESSION_CODEC=org.apache.hadoop.io.compress.GzipCodec"}); } + @Test(groups = {"features", "gpdb", "security", "hcfs"}) + public void parquetWritePrimitivesSnappy() throws Exception { + runWritePrimitivesScenario("pxf_parquet_write_primitives_snappy", "pxf_parquet_read_primitives_snappy", "parquet_write_primitives_snappy", new String[]{"COMPRESSION_CODEC=snappy"}); + } + + @Test(groups = {"features", "gpdb", "security", "hcfs"}) + public void 
parquetWritePrimitivesUncompressed() throws Exception { + runWritePrimitivesScenario("pxf_parquet_write_primitives_uncompressed", "pxf_parquet_read_primitives_uncompressed", "parquet_write_primitives_uncompressed", new String[]{"COMPRESSION_CODEC=uncompressed"}); + } + + @Test(groups = {"features", "gpdb", "security", "hcfs"}) + public void parquetWritePrimitivesZStd() throws Exception { + runWritePrimitivesScenario("pxf_parquet_write_primitives_zstd", "pxf_parquet_read_primitives_zstd", "parquet_write_primitives_zstd", new String[]{"COMPRESSION_CODEC=zstd"}); + } + // Numeric precision not defined, test writing data precision in [1, 38]. All the data should be written correctly. @Test(groups = {"features", "gpdb", "security", "hcfs"}) public void parquetWriteUndefinedPrecisionNumeric() throws Exception { diff --git a/docs/content/hdfs_parquet.md b/docs/content/hdfs_parquet.md index 26ee4817e..9ad05b785 100644 --- a/docs/content/hdfs_parquet.md +++ b/docs/content/hdfs_parquet.md @@ -23,7 +23,7 @@ under the License. Use the PXF HDFS connector to read and write Parquet-format data. This section describes how to read and write HDFS files that are stored in Parquet format, including how to create, query, and insert into external tables that reference files in the HDFS data store. -PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, and `lzo`. +PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, and `zstd`. PXF currently supports reading and writing primitive Parquet data types only. @@ -182,7 +182,7 @@ The PXF `hdfs:parquet` profile supports encoding- and compression-related write | Write Option | Value Description | |-------|-------------------------------------| -| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `lzo`, and `uncompressed` . 
If this option is not provided, PXF compresses the data using `snappy` compression. | +| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `zstd`, and `uncompressed` . If this option is not provided, PXF compresses the data using `snappy` compression. | | ROWGROUP_SIZE | A Parquet file consists of one or more row groups, a logical partitioning of the data into rows. `ROWGROUP_SIZE` identifies the size (in bytes) of the row group. The default row group size is `8 * 1024 * 1024` bytes. | | PAGE_SIZE | A row group consists of column chunks that are divided up into pages. `PAGE_SIZE` is the size (in bytes) of such a page. The default page size is `1 * 1024 * 1024` bytes. | | ENABLE\_DICTIONARY | A boolean value that specifies whether or not to enable dictionary encoding. The default value is `true`; dictionary encoding is enabled when PXF writes Parquet files. | diff --git a/server/build.gradle b/server/build.gradle index ab01dfd2a..5a9d00314 100644 --- a/server/build.gradle +++ b/server/build.gradle @@ -179,7 +179,7 @@ configure(javaProjects) { } // Parquet dependencies - dependency("org.apache.parquet:parquet-format:2.7.0") + dependency("org.apache.parquet:parquet-format:2.11.0") dependencySet(group:"org.apache.parquet", version:"${parquetVersion}") { entry("parquet-column") entry("parquet-common") @@ -188,6 +188,7 @@ configure(javaProjects) { entry("parquet-hadoop") entry("parquet-jackson") entry("parquet-pig") + entry("parquet-format-structures") } // Thrift dependencies diff --git a/server/gradle.properties b/server/gradle.properties index a56fe6c24..ee2bba966 100644 --- a/server/gradle.properties +++ b/server/gradle.properties @@ -23,7 +23,7 @@ hiveVersion=2.3.8 hiveStorageApiVersion=2.7.3 hbaseVersion=2.3.7 junitVersion=4.11 -parquetVersion=1.11.1 +parquetVersion=1.12.3 awsJavaSdk=1.12.261 springBootVersion=2.7.18 org.gradle.daemon=true diff --git a/server/pxf-hdfs/build.gradle 
b/server/pxf-hdfs/build.gradle index a59512038..a087c1210 100644 --- a/server/pxf-hdfs/build.gradle +++ b/server/pxf-hdfs/build.gradle @@ -38,6 +38,7 @@ dependencies { implementation("org.apache.hadoop:hadoop-hdfs") { transitive = false } implementation("org.apache.hadoop:hadoop-hdfs-client") { transitive = false } implementation("org.apache.parquet:parquet-format") { transitive = false } + implementation("org.apache.parquet:parquet-format-structures") { transitive = false } implementation("org.apache.parquet:parquet-column") { transitive = false } implementation("org.apache.parquet:parquet-common") { transitive = false } implementation("org.apache.parquet:parquet-encoding") { transitive = false } From 9d9a2e96cb9210c914ff1745449718c90476b8cd Mon Sep 17 00:00:00 2001 From: Nikolay Antonov Date: Thu, 12 Feb 2026 14:49:55 +0500 Subject: [PATCH 2/2] bump parquet-1.15.2 --- .../pxf/automation/features/parquet/ParquetWriteTest.java | 5 +++++ docs/content/hdfs_parquet.md | 4 ++-- server/build.gradle | 6 ++++-- server/gradle.properties | 2 +- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/automation/src/test/java/org/greenplum/pxf/automation/features/parquet/ParquetWriteTest.java b/automation/src/test/java/org/greenplum/pxf/automation/features/parquet/ParquetWriteTest.java index 0a12b1904..58e48588d 100644 --- a/automation/src/test/java/org/greenplum/pxf/automation/features/parquet/ParquetWriteTest.java +++ b/automation/src/test/java/org/greenplum/pxf/automation/features/parquet/ParquetWriteTest.java @@ -210,6 +210,11 @@ public void parquetWritePrimitivesZStd() throws Exception { runWritePrimitivesScenario("pxf_parquet_write_primitives_zstd", "pxf_parquet_read_primitives_zstd", "parquet_write_primitives_zstd", new String[]{"COMPRESSION_CODEC=zstd"}); } + @Test(groups = {"features", "gpdb", "security", "hcfs"}) + public void parquetWritePrimitivesLZ4_RAW() throws Exception { + runWritePrimitivesScenario("pxf_parquet_write_primitives_lz4_raw", 
"pxf_parquet_read_primitives_lz4_raw", "parquet_write_primitives_lz4_raw", new String[]{"COMPRESSION_CODEC=lz4_raw"}); + } + // Numeric precision not defined, test writing data precision in [1, 38]. All the data should be written correctly. @Test(groups = {"features", "gpdb", "security", "hcfs"}) public void parquetWriteUndefinedPrecisionNumeric() throws Exception { diff --git a/docs/content/hdfs_parquet.md b/docs/content/hdfs_parquet.md index 9ad05b785..856d9fbf6 100644 --- a/docs/content/hdfs_parquet.md +++ b/docs/content/hdfs_parquet.md @@ -23,7 +23,7 @@ under the License. Use the PXF HDFS connector to read and write Parquet-format data. This section describes how to read and write HDFS files that are stored in Parquet format, including how to create, query, and insert into external tables that reference files in the HDFS data store. -PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, and `zstd`. +PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, `lz4_raw`, and `zstd`. PXF currently supports reading and writing primitive Parquet data types only. @@ -182,7 +182,7 @@ The PXF `hdfs:parquet` profile supports encoding- and compression-related write | Write Option | Value Description | |-------|-------------------------------------| -| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `zstd`, and `uncompressed` . If this option is not provided, PXF compresses the data using `snappy` compression. | +| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `lz4_raw`, `zstd`, and `uncompressed`. If this option is not provided, PXF compresses the data using `snappy` compression. | | ROWGROUP_SIZE | A Parquet file consists of one or more row groups, a logical partitioning of the data into rows. 
`ROWGROUP_SIZE` identifies the size (in bytes) of the row group. The default row group size is `8 * 1024 * 1024` bytes. | | PAGE_SIZE | A row group consists of column chunks that are divided up into pages. `PAGE_SIZE` is the size (in bytes) of such a page. The default page size is `1 * 1024 * 1024` bytes. | | ENABLE\_DICTIONARY | A boolean value that specifies whether or not to enable dictionary encoding. The default value is `true`; dictionary encoding is enabled when PXF writes Parquet files. | diff --git a/server/build.gradle b/server/build.gradle index 5a9d00314..012e866d4 100644 --- a/server/build.gradle +++ b/server/build.gradle @@ -92,7 +92,7 @@ configure(javaProjects) { dependency("commons-lang:commons-lang:2.6") dependency("commons-lang:commons-lang3:3.9") dependency("commons-logging:commons-logging:1.1.3") - dependency("io.airlift:aircompressor:0.27") + dependency("io.airlift:aircompressor:2.0.2") dependency("javax.jdo:jdo-api:3.0.1") dependency("joda-time:joda-time:2.8.1") dependency("net.sf.opencsv:opencsv:2.3") @@ -118,7 +118,7 @@ configure(javaProjects) { dependency("org.threeten:threeten-extra:1.5.0") dependency("org.tukaani:xz:1.8") dependency("org.wildfly.openssl:wildfly-openssl:1.0.7.Final") - dependency("org.xerial.snappy:snappy-java:1.1.10.4") + dependency("org.xerial.snappy:snappy-java:1.1.10.7") // Hadoop dependencies dependencySet(group:"org.apache.hadoop", version:"${hadoopVersion}") { @@ -208,6 +208,8 @@ configure(javaProjects) { entry("avro") entry("avro-mapred") } + // Zstd support for Avro/Parquet + dependency("com.github.luben:zstd-jni:1.5.7-6") // Jackson 1.x dependencies dependencySet(group:"org.codehaus.jackson", version:"1.9.13") { diff --git a/server/gradle.properties b/server/gradle.properties index ee2bba966..801714e8e 100644 --- a/server/gradle.properties +++ b/server/gradle.properties @@ -23,7 +23,7 @@ hiveVersion=2.3.8 hiveStorageApiVersion=2.7.3 hbaseVersion=2.3.7 junitVersion=4.11 -parquetVersion=1.12.3 
+parquetVersion=1.15.2 awsJavaSdk=1.12.261 springBootVersion=2.7.18 org.gradle.daemon=true