diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 676bc445238..cca1f32a453 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -262,9 +262,9 @@ class ParquetFile: page_checksum_verification : bool, default False If True, verify the checksum for each page read from the file. arrow_extensions_enabled : bool, default True - If True, read Parquet logical types as Arrow extension types where possible, - (e.g., read JSON as the canonical `arrow.json` extension type or UUID as - the canonical `arrow.uuid` extension type). + If True, read Parquet logical types as Arrow extension types where + possible (e.g., read JSON as the canonical `arrow.json` extension type + or UUID as the canonical `arrow.uuid` extension type). Examples -------- @@ -2332,7 +2332,7 @@ def write_metadata(schema, where, metadata_collector=None, filesystem=None, def read_metadata(where, memory_map=False, decryption_properties=None, - filesystem=None): + filesystem=None, arrow_extensions_enabled=True): """ Read FileMetaData from footer of a single Parquet file. @@ -2347,6 +2347,10 @@ def read_metadata(where, memory_map=False, decryption_properties=None, If nothing passed, will be inferred based on path. Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. + arrow_extensions_enabled : bool, default True + If True, read Parquet logical types as Arrow extension types where + possible (e.g., UUID as the canonical `arrow.uuid` extension type). + If False, use the underlying storage types instead. 
Returns ------- @@ -2376,13 +2380,17 @@ def read_metadata(where, memory_map=False, decryption_properties=None, file_ctx = where = filesystem.open_input_file(where) with file_ctx: - file = ParquetFile(where, memory_map=memory_map, - decryption_properties=decryption_properties) + file = ParquetFile( + where, + memory_map=memory_map, + decryption_properties=decryption_properties, + arrow_extensions_enabled=arrow_extensions_enabled, + ) return file.metadata def read_schema(where, memory_map=False, decryption_properties=None, - filesystem=None): + filesystem=None, arrow_extensions_enabled=True): """ Read effective Arrow schema from Parquet file metadata. @@ -2397,6 +2405,9 @@ def read_schema(where, memory_map=False, decryption_properties=None, If nothing passed, will be inferred based on path. Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. + arrow_extensions_enabled : bool, default True + If True, read Parquet logical types as Arrow extension types where + possible (e.g., UUID as the canonical `arrow.uuid` extension type). 
Returns ------- @@ -2422,9 +2433,12 @@ def read_schema(where, memory_map=False, decryption_properties=None, with file_ctx: file = ParquetFile( - where, memory_map=memory_map, - decryption_properties=decryption_properties) - return file.schema.to_arrow_schema() + where, + memory_map=memory_map, + decryption_properties=decryption_properties, + arrow_extensions_enabled=arrow_extensions_enabled, + ) + return file.schema_arrow __all__ = ( diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 148bfebaa67..ae7dba2d2a1 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -814,3 +814,22 @@ def msg(c): with pytest.raises(TypeError, match=msg("FileMetaData")): pq.FileMetaData() + + +def test_read_schema_uuid_extension_type(tmp_path): + data = [ + b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb', + b'\x1et\x14\x95\xee\xd5C\xea\x9b\xd7s\xdc\x91BK\xaf', + None, + ] + table = pa.table([pa.array(data, type=pa.uuid())], names=["ext"]) + + file_path = tmp_path / "uuid.parquet" + file_path_str = str(file_path) + pq.write_table(table, file_path_str, store_schema=False) + + schema_default = pq.read_schema(file_path_str) + assert schema_default.field("ext").type == pa.uuid() + + schema_disabled = pq.read_schema(file_path_str, arrow_extensions_enabled=False) + assert schema_disabled.field("ext").type == pa.binary(16)