diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 48262b68077..fec04c182a6 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -476,8 +476,8 @@ You can find the official list of canonical extension types in the :ref:`format_canonical_extensions` section. Here we add examples on how to use them in PyArrow. -Fixed size tensor -""""""""""""""""" +Fixed shape tensor +"""""""""""""""""" To create an array of tensors with equal shape (fixed shape tensor array) we first need to define a fixed shape tensor extension type with value type @@ -487,7 +487,7 @@ and shape: >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), (2, 2)) -Then we need the storage array with :func:`pyarrow.list_` type where ``value_type``` +Then we need the storage array with :func:`pyarrow.list_` type where ``value_type`` is the fixed shape tensor value type and list size is a product of ``tensor_type`` shape elements. Then we can create an array of tensors with ``pa.ExtensionArray.from_storage()`` method: @@ -629,3 +629,41 @@ for ``NCHW`` format where: * C: number of channels of the image * H: height of the image * W: width of the image + +UUID +"""" + +The UUID extension type (``arrow.uuid``) represents universally unique +identifiers as 16-byte fixed-size binary values. PyArrow provides integration +with Python's built-in :mod:`uuid` module, including automatic type inference. + +Creating UUID scalars and arrays +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +PyArrow infers the UUID type from Python's ``uuid.UUID`` objects, +so you can pass them directly to :func:`pyarrow.scalar` and :func:`pyarrow.array`: + +.. code-block:: python + + >>> import uuid + >>> import pyarrow as pa + + >>> pa.scalar(uuid.uuid4()) + + + >>> uuids = [uuid.uuid4() for _ in range(3)] + >>> arr = pa.array(uuids) + >>> arr.type + UuidType(extension) + +You can also explicitly specify the UUID type using :func:`pyarrow.uuid`: + +.. 
code-block:: python + + >>> pa.array([uuid.uuid4(), uuid.uuid4()], type=pa.uuid()) + + [ + ..., + ... + ] + diff --git a/python/pyarrow/src/arrow/python/common.h b/python/pyarrow/src/arrow/python/common.h index affefe2859b..a81782330b5 100644 --- a/python/pyarrow/src/arrow/python/common.h +++ b/python/pyarrow/src/arrow/python/common.h @@ -419,6 +419,20 @@ struct PyBytesView { return Status::OK(); } + // Parse bytes from a uuid.UUID object (stores reference to keep bytes alive) + Status ParseUuid(PyObject* obj) { + ref.reset(PyObject_GetAttrString(obj, "bytes")); + RETURN_IF_PYERROR(); + if (!PyBytes_Check(ref.obj())) { + return Status::TypeError("Expected uuid.UUID.bytes to return bytes, got '", + Py_TYPE(ref.obj())->tp_name, "' object"); + } + bytes = PyBytes_AS_STRING(ref.obj()); + size = PyBytes_GET_SIZE(ref.obj()); + is_utf8 = false; + return Status::OK(); + } + protected: OwnedRef ref; }; diff --git a/python/pyarrow/src/arrow/python/helpers.cc b/python/pyarrow/src/arrow/python/helpers.cc index 0a24b259310..3515455d224 100644 --- a/python/pyarrow/src/arrow/python/helpers.cc +++ b/python/pyarrow/src/arrow/python/helpers.cc @@ -296,16 +296,69 @@ bool PyFloat_IsNaN(PyObject* obj) { namespace { -// This needs a conditional, because using std::once_flag could introduce -// a deadlock when the GIL is enabled. See -// https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 for -// more info. +// Thread-safe one-time Python module import + attribute lookup. For Pandas and UUID. +// Uses std::call_once when the GIL is disabled, or a simple boolean flag when +// the GIL is enabled to avoid deadlocks. 
See ARROW-10519 for more details and +// https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 +struct ModuleOnceRunner { + std::string module_name; #ifdef Py_GIL_DISABLED -static std::once_flag pandas_static_initialized; + std::once_flag initialized; #else -static bool pandas_static_initialized = false; + bool initialized = false; #endif + explicit ModuleOnceRunner(const std::string& module_name) : module_name(module_name) {} + + template + void RunOnce(Func&& func) { + auto do_init = [&]() { + OwnedRef module; + if (ImportModule(module_name, &module).ok()) { +#ifndef Py_GIL_DISABLED + // Since ImportModule can release the GIL, another thread could have + // already initialized the static data. + if (initialized) { + return; + } +#endif + func(module); + } + }; +#ifdef Py_GIL_DISABLED + std::call_once(initialized, do_init); +#else + if (!initialized) { + do_init(); + initialized = true; + } +#endif + } +}; + +static PyObject* uuid_UUID = nullptr; +static ModuleOnceRunner uuid_runner("uuid"); + +} // namespace + +bool IsPyUuid(PyObject* obj) { + uuid_runner.RunOnce([](OwnedRef& module) { + OwnedRef ref; + if (ImportFromModule(module.obj(), "UUID", &ref).ok()) { + uuid_UUID = ref.obj(); + } + }); + if (!uuid_UUID) return false; + int result = PyObject_IsInstance(obj, uuid_UUID); + if (result < 0) { + PyErr_Clear(); + return false; + } + return result != 0; +} + +namespace { + // Once initialized, these variables hold borrowed references to Pandas static data. // We should not use OwnedRef here because Python destructors would be // called on a finalized interpreter. 
@@ -315,72 +368,43 @@ static PyObject* pandas_Timedelta = nullptr; static PyObject* pandas_Timestamp = nullptr; static PyTypeObject* pandas_NaTType = nullptr; static PyObject* pandas_DateOffset = nullptr; +static ModuleOnceRunner pandas_runner("pandas"); -void GetPandasStaticSymbols() { - OwnedRef pandas; - - // Import pandas - Status s = ImportModule("pandas", &pandas); - if (!s.ok()) { - return; - } - -#ifndef Py_GIL_DISABLED - // Since ImportModule can release the GIL, another thread could have - // already initialized the static data. - if (pandas_static_initialized) { - return; - } -#endif - - OwnedRef ref; - - // set NaT sentinel and its type - if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) { - pandas_NaT = ref.obj(); - // PyObject_Type returns a new reference but we trust that pandas.NaT will - // outlive our use of this PyObject* - pandas_NaTType = Py_TYPE(ref.obj()); - } - - // retain a reference to Timedelta - if (ImportFromModule(pandas.obj(), "Timedelta", &ref).ok()) { - pandas_Timedelta = ref.obj(); - } +} // namespace - // retain a reference to Timestamp - if (ImportFromModule(pandas.obj(), "Timestamp", &ref).ok()) { - pandas_Timestamp = ref.obj(); - } +void InitPandasStaticData() { + pandas_runner.RunOnce([](OwnedRef& module) { + OwnedRef ref; + + // set NaT sentinel and its type + if (ImportFromModule(module.obj(), "NaT", &ref).ok()) { + pandas_NaT = ref.obj(); + // PyObject_Type returns a new reference but we trust that pandas.NaT will + // outlive our use of this PyObject* + pandas_NaTType = Py_TYPE(ref.obj()); + } - // if pandas.NA exists, retain a reference to it - if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) { - pandas_NA = ref.obj(); - } + // retain a reference to Timedelta + if (ImportFromModule(module.obj(), "Timedelta", &ref).ok()) { + pandas_Timedelta = ref.obj(); + } - // Import DateOffset type - if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) { - pandas_DateOffset = ref.obj(); - } -} + // retain a reference to 
Timestamp + if (ImportFromModule(module.obj(), "Timestamp", &ref).ok()) { + pandas_Timestamp = ref.obj(); + } -} // namespace + // if pandas.NA exists, retain a reference to it + if (ImportFromModule(module.obj(), "NA", &ref).ok()) { + pandas_NA = ref.obj(); + } -#ifdef Py_GIL_DISABLED -void InitPandasStaticData() { - std::call_once(pandas_static_initialized, GetPandasStaticSymbols); -} -#else -void InitPandasStaticData() { - // NOTE: This is called with the GIL held. We needn't (and shouldn't, - // to avoid deadlocks) use an additional C++ lock (ARROW-10519). - if (pandas_static_initialized) { - return; - } - GetPandasStaticSymbols(); - pandas_static_initialized = true; + // Import DateOffset type + if (ImportFromModule(module.obj(), "DateOffset", &ref).ok()) { + pandas_DateOffset = ref.obj(); + } + }); } -#endif bool PandasObjectIsNull(PyObject* obj) { if (!MayHaveNaN(obj)) { diff --git a/python/pyarrow/src/arrow/python/helpers.h b/python/pyarrow/src/arrow/python/helpers.h index b0cf1010289..b4417a96441 100644 --- a/python/pyarrow/src/arrow/python/helpers.h +++ b/python/pyarrow/src/arrow/python/helpers.h @@ -92,6 +92,10 @@ PyObject* BorrowPandasDataOffsetType(); ARROW_PYTHON_EXPORT bool PyFloat_IsNaN(PyObject* obj); +// \brief Check whether obj is a uuid.UUID instance +ARROW_PYTHON_EXPORT +bool IsPyUuid(PyObject* obj); + inline bool IsPyBinary(PyObject* obj) { return PyBytes_Check(obj) || PyByteArray_Check(obj) || PyMemoryView_Check(obj); } diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc index 06cb4694831..291cc421495 100644 --- a/python/pyarrow/src/arrow/python/inference.cc +++ b/python/pyarrow/src/arrow/python/inference.cc @@ -27,6 +27,7 @@ #include #include +#include "arrow/extension/uuid.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/util/decimal.h" @@ -407,6 +408,7 @@ class TypeInferrer { arrow_scalar_count_(0), numpy_dtype_count_(0), interval_count_(0), + uuid_count_(0), 
max_decimal_metadata_(std::numeric_limits::min(), std::numeric_limits::min()), decimal_type_() { @@ -475,6 +477,9 @@ class TypeInferrer { ++decimal_count_; } else if (PyObject_IsInstance(obj, interval_types_.obj())) { ++interval_count_; + } else if (internal::IsPyUuid(obj)) { + ++uuid_count_; + *keep_going = make_unions_; } else { return internal::InvalidValue(obj, "did not recognize Python value type when inferring " @@ -604,6 +609,8 @@ class TypeInferrer { *out = utf8(); } else if (interval_count_) { *out = month_day_nano_interval(); + } else if (uuid_count_) { + *out = extension::uuid(); } else if (arrow_scalar_count_) { *out = scalar_type_; } else { @@ -766,6 +773,7 @@ class TypeInferrer { int64_t arrow_scalar_count_; int64_t numpy_dtype_count_; int64_t interval_count_; + int64_t uuid_count_; std::unique_ptr list_inferrer_; std::vector> struct_inferrers_; std::unordered_map struct_field_index_; diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 139eb1d7f4f..c70510a4800 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -36,6 +36,7 @@ #include "arrow/array/builder_primitive.h" #include "arrow/array/builder_time.h" #include "arrow/chunked_array.h" +#include "arrow/extension_type.h" #include "arrow/result.h" #include "arrow/scalar.h" #include "arrow/status.h" @@ -512,7 +513,12 @@ class PyValue { static Status Convert(const FixedSizeBinaryType* type, const O&, I obj, PyBytesView& view) { - ARROW_RETURN_NOT_OK(view.ParseString(obj)); + // Check if obj is a uuid.UUID instance + if (internal::IsPyUuid(obj)) { + ARROW_RETURN_NOT_OK(view.ParseUuid(obj)); + } else { + ARROW_RETURN_NOT_OK(view.ParseString(obj)); + } if (view.size != type->byte_width()) { std::stringstream ss; ss << "expected to be length " << type->byte_width() << " was " << view.size; @@ -1268,9 +1274,16 @@ Result> ConvertPySequence(PyObject* obj, PyObject* // 
In some cases, type inference may be "loose", like strings. If the user // passed pa.string(), then we will error if we encounter any non-UTF8 // value. If not, then we will allow the result to be a BinaryArray + std::shared_ptr extension_type; if (options.type == nullptr) { ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas)); options.strict = false; + // If type inference returned an extension type, convert using + // the storage type and then wrap the result as an extension array + if (options.type->id() == Type::EXTENSION) { + extension_type = options.type; + options.type = checked_cast(*options.type).storage_type(); + } } else { options.strict = true; } @@ -1278,6 +1291,7 @@ Result> ConvertPySequence(PyObject* obj, PyObject* ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter( options.type, options, pool))); + std::shared_ptr result; if (converter->may_overflow()) { // The converter hierarchy contains binary- or list-like builders which can overflow // depending on the input values. Wrap the converter with a chunker which detects @@ -1288,7 +1302,7 @@ Result> ConvertPySequence(PyObject* obj, PyObject* } else { RETURN_NOT_OK(chunked_converter->Extend(seq, size)); } - return chunked_converter->ToChunkedArray(); + ARROW_ASSIGN_OR_RAISE(result, chunked_converter->ToChunkedArray()); } else { // If the converter can't overflow spare the capacity error checking on the hot-path, // this improves the performance roughly by ~10% for primitive types. 
def test_uuid_scalar_from_python():
    """pa.scalar accepts uuid.UUID objects, with and without an explicit type."""
    import uuid

    # Test with explicit type
    py_uuid = uuid.uuid4()
    scalar = pa.scalar(py_uuid, type=pa.uuid())
    assert isinstance(scalar, pa.UuidScalar)
    assert scalar.type == pa.uuid()
    assert scalar.as_py() == py_uuid

    # Test with specific UUID value. Use the locally imported module so the
    # test does not depend on a separate file-level ``UUID`` import.
    specific_uuid = uuid.UUID("12345678-1234-5678-1234-567812345678")
    scalar = pa.scalar(specific_uuid, type=pa.uuid())
    assert scalar.as_py() == specific_uuid
    assert scalar.value.as_py() == specific_uuid.bytes

    # A null scalar of UUID type
    scalar = pa.scalar(None, type=pa.uuid())
    assert scalar.is_valid is False
    assert scalar.as_py() is None

    # Test type inference from uuid.UUID
    py_uuid = uuid.uuid4()
    scalar = pa.scalar(py_uuid)
    assert isinstance(scalar, pa.UuidScalar)
    assert scalar.type == pa.uuid()
    assert scalar.as_py() == py_uuid


def test_uuid_array_from_python():
    """pa.array accepts lists of uuid.UUID (and None), typed or inferred."""
    import uuid

    # Test array with explicit type
    uuids = [uuid.uuid4() for _ in range(3)]
    uuids.append(None)

    arr = pa.array(uuids, type=pa.uuid())
    assert arr.type == pa.uuid()
    assert len(arr) == 4
    assert arr.null_count == 1
    for i, u in enumerate(uuids):
        assert arr[i].as_py() == u

    # Test type inference for arrays
    arr = pa.array(uuids)
    assert arr.type == pa.uuid()
    for i, u in enumerate(uuids):
        assert arr[i].as_py() == u


@pytest.mark.parametrize("bytes_value,exc_type,match", [
    (b"0123456789abcde", pa.ArrowInvalid, "expected to be length 16 was 15"),
    (
        "0123456789abcdef", TypeError,
        "Expected uuid.UUID.bytes to return bytes, got 'str'"
    ),
    (None, TypeError, "Expected uuid.UUID.bytes to return bytes, got 'NoneType'"),
])
def test_uuid_bytes_property_not_bytes(bytes_value, exc_type, match):
    """A UUID subclass whose ``bytes`` value is unusable is rejected.

    Covers a bytes value of the wrong length as well as non-bytes values,
    for both pa.array and pa.scalar.
    """
    import uuid

    class BadUuid(uuid.UUID):
        @property
        def bytes(self):
            return bytes_value

    bad = BadUuid(uuid.uuid4().hex)
    with pytest.raises(exc_type, match=match):
        pa.array([bad], type=pa.uuid())
    with pytest.raises(exc_type, match=match):
        pa.scalar(bad, type=pa.uuid())


def test_uuid_bytes_property_raises():
    """An exception raised by the ``bytes`` property propagates to the caller."""
    import uuid

    class BadUuid(uuid.UUID):
        @property
        def bytes(self):
            raise RuntimeError("broken")

    bad = BadUuid(uuid.uuid4().hex)
    with pytest.raises(RuntimeError, match="broken"):
        pa.array([bad], type=pa.uuid())