From 5591212fdbd7fe68e6b3c1f3eb8c55a7ddb96e6b Mon Sep 17 00:00:00 2001 From: Dan Cocking Date: Thu, 14 Aug 2025 11:01:39 +1000 Subject: [PATCH] SHEN-16540 Add write page index option to ParquetWriter Currently, users have no option to write the page index when writing Parquet files. This restricts user-side optimisation because it prevents page-level statistics from being persisted. Expose the option as the `write.parquet.write-page-index` table property (default: false) so that users can opt in to writing page-level indexes. --- pyiceberg/io/pyarrow.py | 5 +++++ pyiceberg/table/__init__.py | 3 +++ 2 files changed, 8 insertions(+) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index d81d2bf6c3..32ad2c96c8 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -2570,6 +2570,11 @@ def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]: property_name=TableProperties.PARQUET_PAGE_ROW_LIMIT, default=TableProperties.PARQUET_PAGE_ROW_LIMIT_DEFAULT, ), + "write_page_index": property_as_bool( + properties=table_properties, + property_name=TableProperties.PARQUET_WRITE_PAGE_INDEX, + default=TableProperties.PARQUET_WRITE_PAGE_INDEX_DEFAULT, + ) } diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index f81a767443..159cae32bc 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -201,6 +201,9 @@ class TableProperties: PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX = "write.parquet.bloom-filter-enabled.column" + PARQUET_WRITE_PAGE_INDEX = "write.parquet.write-page-index" + PARQUET_WRITE_PAGE_INDEX_DEFAULT = False + WRITE_TARGET_FILE_SIZE_BYTES = "write.target-file-size-bytes" WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT = 512 * 1024 * 1024 # 512 MB