From 8ee7aeb3b1184228774524242af248fbed5b3102 Mon Sep 17 00:00:00 2001 From: Nicolas Vandeginste Date: Tue, 25 Nov 2025 17:45:35 +0100 Subject: [PATCH 1/2] GH-48254: [Python][Parquet] Add read_schema UUID extension regression test --- .../pyarrow/tests/parquet/test_data_types.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index c546bc1532a..571343472ac 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -604,6 +604,25 @@ def test_uuid_extension_type(): store_schema=False) +def test_read_schema_uuid_extension_type(tmp_path): + data = [ + b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb', + b'\x1et\x14\x95\xee\xd5C\xea\x9b\xd7s\xdc\x91BK\xaf', + None, + ] + table = pa.table([pa.array(data, type=pa.uuid())], names=["ext"]) + + file_path = tmp_path / "uuid.parquet" + file_path_str = str(file_path) + pq.write_table(table, file_path_str, store_schema=False) + + schema_default = pq.read_schema(file_path_str) + assert schema_default.field("ext").type == pa.uuid() + + schema_disabled = pq.read_schema(file_path_str, arrow_extensions_enabled=False) + assert schema_disabled.field("ext").type == pa.binary(16) + + def test_undefined_logical_type(parquet_test_datadir): test_file = f"{parquet_test_datadir}/unknown-logical-type.parquet" From 966df38a70dd553fa8632ed39c765fc0300a148e Mon Sep 17 00:00:00 2001 From: Nicolas Vandeginste Date: Tue, 25 Nov 2025 17:45:48 +0100 Subject: [PATCH 2/2] GH-48254: [Python][Parquet] Support extension types in read_schema --- python/pyarrow/parquet/core.py | 34 +++++++++++++------ .../pyarrow/tests/parquet/test_data_types.py | 19 ----------- python/pyarrow/tests/parquet/test_metadata.py | 19 +++++++++++ 3 files changed, 43 insertions(+), 29 deletions(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 676bc445238..cca1f32a453 100644 
--- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -262,9 +262,9 @@ class ParquetFile: page_checksum_verification : bool, default False If True, verify the checksum for each page read from the file. arrow_extensions_enabled : bool, default True - If True, read Parquet logical types as Arrow extension types where possible, - (e.g., read JSON as the canonical `arrow.json` extension type or UUID as - the canonical `arrow.uuid` extension type). + If True, read Parquet logical types as Arrow extension types where + possible (e.g., read JSON as the canonical `arrow.json` extension type + or UUID as the canonical `arrow.uuid` extension type). Examples -------- @@ -2332,7 +2332,7 @@ def write_metadata(schema, where, metadata_collector=None, filesystem=None, def read_metadata(where, memory_map=False, decryption_properties=None, - filesystem=None): + filesystem=None, arrow_extensions_enabled=True): """ Read FileMetaData from footer of a single Parquet file. @@ -2347,6 +2347,10 @@ def read_metadata(where, memory_map=False, decryption_properties=None, If nothing passed, will be inferred based on path. Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. + arrow_extensions_enabled : bool, default True + If True, read Parquet logical types as Arrow extension types where + possible (e.g., UUID as the canonical `arrow.uuid` extension type). + If False, use the underlying storage types instead. 
Returns ------- @@ -2376,13 +2380,17 @@ def read_metadata(where, memory_map=False, decryption_properties=None, file_ctx = where = filesystem.open_input_file(where) with file_ctx: - file = ParquetFile(where, memory_map=memory_map, - decryption_properties=decryption_properties) + file = ParquetFile( + where, + memory_map=memory_map, + decryption_properties=decryption_properties, + arrow_extensions_enabled=arrow_extensions_enabled, + ) return file.metadata def read_schema(where, memory_map=False, decryption_properties=None, - filesystem=None): + filesystem=None, arrow_extensions_enabled=True): """ Read effective Arrow schema from Parquet file metadata. @@ -2397,6 +2405,9 @@ def read_schema(where, memory_map=False, decryption_properties=None, If nothing passed, will be inferred based on path. Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. + arrow_extensions_enabled : bool, default True + If True, read Parquet logical types as Arrow extension types where + possible (e.g., UUID as the canonical `arrow.uuid` extension type). 
Returns ------- @@ -2422,9 +2433,12 @@ def read_schema(where, memory_map=False, decryption_properties=None, with file_ctx: file = ParquetFile( - where, memory_map=memory_map, - decryption_properties=decryption_properties) - return file.schema.to_arrow_schema() + where, + memory_map=memory_map, + decryption_properties=decryption_properties, + arrow_extensions_enabled=arrow_extensions_enabled, + ) + return file.schema_arrow __all__ = ( diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index 571343472ac..c546bc1532a 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -604,25 +604,6 @@ def test_uuid_extension_type(): store_schema=False) -def test_read_schema_uuid_extension_type(tmp_path): - data = [ - b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb', - b'\x1et\x14\x95\xee\xd5C\xea\x9b\xd7s\xdc\x91BK\xaf', - None, - ] - table = pa.table([pa.array(data, type=pa.uuid())], names=["ext"]) - - file_path = tmp_path / "uuid.parquet" - file_path_str = str(file_path) - pq.write_table(table, file_path_str, store_schema=False) - - schema_default = pq.read_schema(file_path_str) - assert schema_default.field("ext").type == pa.uuid() - - schema_disabled = pq.read_schema(file_path_str, arrow_extensions_enabled=False) - assert schema_disabled.field("ext").type == pa.binary(16) - - def test_undefined_logical_type(parquet_test_datadir): test_file = f"{parquet_test_datadir}/unknown-logical-type.parquet" diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 148bfebaa67..ae7dba2d2a1 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -814,3 +814,22 @@ def msg(c): with pytest.raises(TypeError, match=msg("FileMetaData")): pq.FileMetaData() + + +def test_read_schema_uuid_extension_type(tmp_path): + data = [ + b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb', + 
b'\x1et\x14\x95\xee\xd5C\xea\x9b\xd7s\xdc\x91BK\xaf', + None, + ] + table = pa.table([pa.array(data, type=pa.uuid())], names=["ext"]) + + file_path = tmp_path / "uuid.parquet" + file_path_str = str(file_path) + pq.write_table(table, file_path_str, store_schema=False) + + schema_default = pq.read_schema(file_path_str) + assert schema_default.field("ext").type == pa.uuid() + + schema_disabled = pq.read_schema(file_path_str, arrow_extensions_enabled=False) + assert schema_disabled.field("ext").type == pa.binary(16)