Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 41 additions & 3 deletions docs/source/python/extending_types.rst
Original file line number Diff line number Diff line change
Expand Up @@ -476,8 +476,8 @@ You can find the official list of canonical extension types in the
:ref:`format_canonical_extensions` section. Here we add examples on how to
use them in PyArrow.

Fixed size tensor
"""""""""""""""""
Fixed shape tensor
""""""""""""""""""

To create an array of tensors with equal shape (fixed shape tensor array) we
first need to define a fixed shape tensor extension type with value type
Expand All @@ -487,7 +487,7 @@ and shape:

>>> tensor_type = pa.fixed_shape_tensor(pa.int32(), (2, 2))

Then we need the storage array with :func:`pyarrow.list_` type where ``value_type```
Then we need the storage array with :func:`pyarrow.list_` type where ``value_type``
is the fixed shape tensor value type and the list size is the product of the
``tensor_type`` shape elements. Then we can create an array of tensors with the
``pa.ExtensionArray.from_storage()`` method:
Expand Down Expand Up @@ -629,3 +629,41 @@ for ``NCHW`` format where:
* C: number of channels of the image
* H: height of the image
* W: width of the image

UUID
""""

The UUID extension type (``arrow.uuid``) represents universally unique
identifiers as 16-byte fixed-size binary values. PyArrow provides integration
with Python's built-in :mod:`uuid` module, including automatic type inference.

Creating UUID scalars and arrays
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

PyArrow infers the UUID type from Python's ``uuid.UUID`` objects,
so you can pass them directly to :func:`pyarrow.scalar` and :func:`pyarrow.array`:

.. code-block:: python

>>> import uuid
>>> import pyarrow as pa

>>> pa.scalar(uuid.uuid4())
<pyarrow.UuidScalar: UUID('...')>

>>> uuids = [uuid.uuid4() for _ in range(3)]
>>> arr = pa.array(uuids)
>>> arr.type
UuidType(extension<arrow.uuid>)

You can also explicitly specify the UUID type using :func:`pyarrow.uuid`:

.. code-block:: python

>>> pa.array([uuid.uuid4(), uuid.uuid4()], type=pa.uuid())
<pyarrow.lib.UuidArray object at ...>
[
...,
...
]

10 changes: 10 additions & 0 deletions python/pyarrow/src/arrow/python/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,16 @@ struct PyBytesView {
return Status::OK();
}

// Parse bytes from a uuid.UUID object.
//
// Reads the object's `bytes` attribute and points this view at its contents.
// The attribute value is stored in `ref` to keep the underlying bytes alive.
//
// Returns an error Status if the attribute lookup raises a Python exception
// or if the attribute value is not a bytes object.
Status ParseUuid(PyObject* obj) {
  ref.reset(PyObject_GetAttrString(obj, "bytes"));
  RETURN_IF_PYERROR();
  // PyBytes_AS_STRING / PyBytes_GET_SIZE perform no type checking, so guard
  // against e.g. a UUID subclass whose `bytes` attribute is not a bytes object.
  if (!PyBytes_Check(ref.obj())) {
    return Status::TypeError(
        "Expected the 'bytes' attribute of a uuid.UUID object to be a bytes object");
  }
  bytes = PyBytes_AS_STRING(ref.obj());
  size = PyBytes_GET_SIZE(ref.obj());
  is_utf8 = false;
  return Status::OK();
}

protected:
OwnedRef ref;
};
Expand Down
154 changes: 89 additions & 65 deletions python/pyarrow/src/arrow/python/helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -296,16 +296,69 @@ bool PyFloat_IsNaN(PyObject* obj) {

namespace {

// This needs a conditional, because using std::once_flag could introduce
// a deadlock when the GIL is enabled. See
// https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 for
// more info.
// Thread-safe one-time Python module import + attribute lookup, used for the
// pandas and uuid modules.
// Uses std::call_once when the GIL is disabled, or a simple boolean flag when
// the GIL is enabled to avoid deadlocks. See ARROW-10519 and
// https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272
// for more details.
struct ModuleOnceRunner {
std::string module_name;
#ifdef Py_GIL_DISABLED
static std::once_flag pandas_static_initialized;
std::once_flag initialized;
#else
static bool pandas_static_initialized = false;
bool initialized = false;
#endif

explicit ModuleOnceRunner(const std::string& module_name) : module_name(module_name) {}

template <typename Func>
void RunOnce(Func&& func) {
auto do_init = [&]() {
OwnedRef module;
if (ImportModule(module_name, &module).ok()) {
#ifndef Py_GIL_DISABLED
// Since ImportModule can release the GIL, another thread could have
// already initialized the static data.
if (initialized) {
return;
}
#endif
func(module);
}
};
#ifdef Py_GIL_DISABLED
std::call_once(initialized, do_init);
#else
if (!initialized) {
do_init();
initialized = true;
}
#endif
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're duplicating code here between different module imports. It would be really nice to write something like this:

struct UuidModuleData {
  PyObject* UUID_class = nullptr;
};

UuidModuleData* InitUuidStaticData() {
  static ModuleOnceRunner runner("uuid");
  return runner.Run([&](OwnedRef module) -> UuidModuleData {
    UuidModuleData data;
    OwnedRef ref;
    if (ImportFromModule(module.obj(), "UUID", &ref).ok()) {
      data.UUID_class = ref.obj();
    }
    return data;
  });
}
struct ModuleOnceRunner {
  std::string module_name;
#ifdef Py_GIL_DISABLED
  std::once_flag initialized;
#else
  bool initialized = false;
#endif

  template <typename Func>
  auto Run(Func&& func) -> decltype(func(OwnedRef()) {
    using RetType = decltype(func(OwnedRef());
    RetType ret{};
    auto wrapper_func = [&]() {
      OwnerRef module;
      if (ImportModule("uuid", &module).ok()) {
        ret = func(std::move(module));
      }
    };
#ifdef Py_GIL_DISABLED
    std::call_once(initialized, wrapper_func);
#else
    if (!initialized) {
      initialized = true;
      wrapper_func();
    }
#endif
    return ret;
  };
};

I think @rok can help.

}
};

// Pointer to the uuid.UUID class, assigned on the first IsPyUuid() call;
// stays nullptr if the uuid module or its UUID attribute cannot be imported.
static PyObject* uuid_UUID = nullptr;
// Runs the one-time import of the "uuid" module used to populate uuid_UUID.
static ModuleOnceRunner uuid_runner("uuid");

} // namespace

// Check whether `obj` is an instance of Python's uuid.UUID class.
//
// The uuid.UUID class is looked up once (on first call) and cached in
// uuid_UUID. Returns false if the class is unavailable, or if the instance
// check raises (the Python error is cleared in that case).
bool IsPyUuid(PyObject* obj) {
  // One-time initializer: cache the UUID class from the imported module.
  auto cache_uuid_class = [](OwnedRef& uuid_module) {
    OwnedRef uuid_class;
    if (ImportFromModule(uuid_module.obj(), "UUID", &uuid_class).ok()) {
      uuid_UUID = uuid_class.obj();
    }
  };
  uuid_runner.RunOnce(cache_uuid_class);

  if (uuid_UUID == nullptr) {
    return false;
  }

  const int is_instance = PyObject_IsInstance(obj, uuid_UUID);
  if (is_instance >= 0) {
    return is_instance != 0;
  }
  // The instance check raised: swallow the error and report "not a UUID".
  PyErr_Clear();
  return false;
}

namespace {

// Once initialized, these variables hold borrowed references to Pandas static data.
// We should not use OwnedRef here because Python destructors would be
// called on a finalized interpreter.
Expand All @@ -315,72 +368,43 @@ static PyObject* pandas_Timedelta = nullptr;
static PyObject* pandas_Timestamp = nullptr;
static PyTypeObject* pandas_NaTType = nullptr;
static PyObject* pandas_DateOffset = nullptr;
static ModuleOnceRunner pandas_runner("pandas");

void GetPandasStaticSymbols() {
OwnedRef pandas;

// Import pandas
Status s = ImportModule("pandas", &pandas);
if (!s.ok()) {
return;
}

#ifndef Py_GIL_DISABLED
// Since ImportModule can release the GIL, another thread could have
// already initialized the static data.
if (pandas_static_initialized) {
return;
}
#endif

OwnedRef ref;

// set NaT sentinel and its type
if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) {
pandas_NaT = ref.obj();
// PyObject_Type returns a new reference but we trust that pandas.NaT will
// outlive our use of this PyObject*
pandas_NaTType = Py_TYPE(ref.obj());
}

// retain a reference to Timedelta
if (ImportFromModule(pandas.obj(), "Timedelta", &ref).ok()) {
pandas_Timedelta = ref.obj();
}
} // namespace

// retain a reference to Timestamp
if (ImportFromModule(pandas.obj(), "Timestamp", &ref).ok()) {
pandas_Timestamp = ref.obj();
}
void InitPandasStaticData() {
pandas_runner.RunOnce([](OwnedRef& module) {
OwnedRef ref;

// set NaT sentinel and its type
if (ImportFromModule(module.obj(), "NaT", &ref).ok()) {
pandas_NaT = ref.obj();
// Py_TYPE returns a borrowed reference but we trust that pandas.NaT will
// outlive our use of this PyTypeObject*
pandas_NaTType = Py_TYPE(ref.obj());
}

// if pandas.NA exists, retain a reference to it
if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) {
pandas_NA = ref.obj();
}
// retain a reference to Timedelta
if (ImportFromModule(module.obj(), "Timedelta", &ref).ok()) {
pandas_Timedelta = ref.obj();
}

// Import DateOffset type
if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) {
pandas_DateOffset = ref.obj();
}
}
// retain a reference to Timestamp
if (ImportFromModule(module.obj(), "Timestamp", &ref).ok()) {
pandas_Timestamp = ref.obj();
}

} // namespace
// if pandas.NA exists, retain a reference to it
if (ImportFromModule(module.obj(), "NA", &ref).ok()) {
pandas_NA = ref.obj();
}

#ifdef Py_GIL_DISABLED
void InitPandasStaticData() {
std::call_once(pandas_static_initialized, GetPandasStaticSymbols);
}
#else
void InitPandasStaticData() {
// NOTE: This is called with the GIL held. We needn't (and shouldn't,
// to avoid deadlocks) use an additional C++ lock (ARROW-10519).
if (pandas_static_initialized) {
return;
}
GetPandasStaticSymbols();
pandas_static_initialized = true;
// Import DateOffset type
if (ImportFromModule(module.obj(), "DateOffset", &ref).ok()) {
pandas_DateOffset = ref.obj();
}
});
}
#endif

bool PandasObjectIsNull(PyObject* obj) {
if (!MayHaveNaN(obj)) {
Expand Down
4 changes: 4 additions & 0 deletions python/pyarrow/src/arrow/python/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ PyObject* BorrowPandasDataOffsetType();
ARROW_PYTHON_EXPORT
bool PyFloat_IsNaN(PyObject* obj);

// \brief Check whether obj is a uuid.UUID instance
//
// Returns false (rather than reporting an error) when the uuid.UUID class
// could not be looked up or when the instance check itself raises; in the
// latter case the Python error is cleared.
ARROW_PYTHON_EXPORT
bool IsPyUuid(PyObject* obj);

// \brief Check whether obj is a bytes, bytearray or memoryview object
inline bool IsPyBinary(PyObject* obj) {
  if (PyBytes_Check(obj)) {
    return true;
  }
  if (PyByteArray_Check(obj)) {
    return true;
  }
  return PyMemoryView_Check(obj) != 0;
}
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/src/arrow/python/inference.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <utility>
#include <vector>

#include "arrow/extension/uuid.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/util/decimal.h"
Expand Down Expand Up @@ -407,6 +408,7 @@ class TypeInferrer {
arrow_scalar_count_(0),
numpy_dtype_count_(0),
interval_count_(0),
uuid_count_(0),
max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
std::numeric_limits<int32_t>::min()),
decimal_type_() {
Expand Down Expand Up @@ -475,6 +477,9 @@ class TypeInferrer {
++decimal_count_;
} else if (PyObject_IsInstance(obj, interval_types_.obj())) {
++interval_count_;
} else if (internal::IsPyUuid(obj)) {
++uuid_count_;
*keep_going = make_unions_;
} else {
return internal::InvalidValue(obj,
"did not recognize Python value type when inferring "
Expand Down Expand Up @@ -604,6 +609,8 @@ class TypeInferrer {
*out = utf8();
} else if (interval_count_) {
*out = month_day_nano_interval();
} else if (uuid_count_) {
*out = extension::uuid();
} else if (arrow_scalar_count_) {
*out = scalar_type_;
} else {
Expand Down Expand Up @@ -766,6 +773,7 @@ class TypeInferrer {
int64_t arrow_scalar_count_;
int64_t numpy_dtype_count_;
int64_t interval_count_;
int64_t uuid_count_;
std::unique_ptr<TypeInferrer> list_inferrer_;
std::vector<std::pair<std::string, TypeInferrer>> struct_inferrers_;
std::unordered_map<std::string, size_t> struct_field_index_;
Expand Down
25 changes: 22 additions & 3 deletions python/pyarrow/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "arrow/array/builder_primitive.h"
#include "arrow/array/builder_time.h"
#include "arrow/chunked_array.h"
#include "arrow/extension_type.h"
#include "arrow/result.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
Expand Down Expand Up @@ -512,7 +513,12 @@ class PyValue {

static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
PyBytesView& view) {
ARROW_RETURN_NOT_OK(view.ParseString(obj));
// Check if obj is a uuid.UUID instance
if (internal::IsPyUuid(obj)) {
ARROW_RETURN_NOT_OK(view.ParseUuid(obj));
} else {
ARROW_RETURN_NOT_OK(view.ParseString(obj));
}
if (view.size != type->byte_width()) {
std::stringstream ss;
ss << "expected to be length " << type->byte_width() << " was " << view.size;
Expand Down Expand Up @@ -1268,16 +1274,24 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
// In some cases, type inference may be "loose", like strings. If the user
// passed pa.string(), then we will error if we encounter any non-UTF8
// value. If not, then we will allow the result to be a BinaryArray
std::shared_ptr<DataType> extension_type;
if (options.type == nullptr) {
ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas));
options.strict = false;
// If type inference returned an extension type, convert using
// the storage type and then wrap the result as an extension array
if (options.type->id() == Type::EXTENSION) {
extension_type = options.type;
options.type = checked_cast<const ExtensionType&>(*options.type).storage_type();
}
} else {
options.strict = true;
}
ARROW_DCHECK_GE(size, 0);

ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter<PyConverter, PyConverterTrait>(
options.type, options, pool)));
Comment on lines 1292 to 1293
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does MakeConverter support extension types here? I see that we only unwrap the extension type in the inference path above.

std::shared_ptr<ChunkedArray> result;
if (converter->may_overflow()) {
// The converter hierarchy contains binary- or list-like builders which can overflow
// depending on the input values. Wrap the converter with a chunker which detects
Expand All @@ -1288,7 +1302,7 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
} else {
RETURN_NOT_OK(chunked_converter->Extend(seq, size));
}
return chunked_converter->ToChunkedArray();
ARROW_ASSIGN_OR_RAISE(result, chunked_converter->ToChunkedArray());
} else {
// If the converter can't overflow spare the capacity error checking on the hot-path,
// this improves the performance roughly by ~10% for primitive types.
Expand All @@ -1297,8 +1311,13 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
} else {
RETURN_NOT_OK(converter->Extend(seq, size));
}
return converter->ToChunkedArray();
ARROW_ASSIGN_OR_RAISE(result, converter->ToChunkedArray());
}
// If we inferred an extension type, wrap as an extension array
if (extension_type != nullptr) {
return ExtensionType::WrapArray(extension_type, result);
}
return result;
}

} // namespace py
Expand Down
Loading
Loading