-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-48241: [Python] Scalar inferencing doesn't infer UUID #48727
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
d137335
18fd6c4
d83fb68
2974e12
2a0f886
e157993
c2f38b1
f5e2c5e
e0007ba
c92bd10
e5815ce
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -296,16 +296,69 @@ bool PyFloat_IsNaN(PyObject* obj) { | |
|
|
||
| namespace { | ||
|
|
||
| // This needs a conditional, because using std::once_flag could introduce | ||
| // a deadlock when the GIL is enabled. See | ||
| // https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 for | ||
| // more info. | ||
| // Thread-safe one-time Python module import + attribute lookup. For Pandas and UUID. | ||
| // Uses std::call_once when the GIL is disabled, or a simple boolean flag when | ||
| // the GIL is enabled to avoid deadlocks. See ARROW-10519 for more details and | ||
| // https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 | ||
| struct ModuleOnceRunner { | ||
| std::string module_name; | ||
| #ifdef Py_GIL_DISABLED | ||
| static std::once_flag pandas_static_initialized; | ||
| std::once_flag initialized; | ||
| #else | ||
| static bool pandas_static_initialized = false; | ||
| bool initialized = false; | ||
| #endif | ||
|
|
||
| explicit ModuleOnceRunner(const std::string& module_name) : module_name(module_name) {} | ||
|
|
||
| template <typename Func> | ||
| void RunOnce(Func&& func) { | ||
| auto do_init = [&]() { | ||
| OwnedRef module; | ||
| if (ImportModule(module_name, &module).ok()) { | ||
| #ifndef Py_GIL_DISABLED | ||
| // Since ImportModule can release the GIL, another thread could have | ||
| // already initialized the static data. | ||
| if (initialized) { | ||
| return; | ||
| } | ||
| #endif | ||
| func(module); | ||
| } | ||
| }; | ||
| #ifdef Py_GIL_DISABLED | ||
| std::call_once(initialized, do_init); | ||
| #else | ||
| if (!initialized) { | ||
| do_init(); | ||
| initialized = true; | ||
| } | ||
| #endif | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We're duplicating code here between different module imports. It would be really nice to write something like this: struct UuidModuleData {
PyObject* UUID_class = nullptr;
};
UuidModuleData* InitUuidStaticData() {
static ModuleOnceRunner runner("uuid");
return runner.Run([&](OwnedRef module) -> UuidModuleData {
UuidModuleData data;
OwnedRef ref;
if (ImportFromModule(module.obj(), "UUID", &ref).ok()) {
data.UUID_class = ref.obj();
}
return data;
});
}struct ModuleOnceRunner {
std::string module_name;
#ifdef Py_GIL_DISABLED
std::once_flag initialized;
#else
bool initialized = false;
#endif
template <typename Func>
auto Run(Func&& func) -> decltype(func(OwnedRef()) {
using RetType = decltype(func(OwnedRef());
RetType ret{};
auto wrapper_func = [&]() {
OwnerRef module;
if (ImportModule("uuid", &module).ok()) {
ret = func(std::move(module));
}
};
#ifdef Py_GIL_DISABLED
std::call_once(initialized, wrapper_func);
#else
if (!initialized) {
initialized = true;
wrapper_func();
}
#endif
return ret;
};
};
I think @rok can help. |
||
| } | ||
| }; | ||
|
|
||
| static PyObject* uuid_UUID = nullptr; | ||
| static ModuleOnceRunner uuid_runner("uuid"); | ||
|
|
||
| } // namespace | ||
|
|
||
| bool IsPyUuid(PyObject* obj) { | ||
| uuid_runner.RunOnce([](OwnedRef& module) { | ||
| OwnedRef ref; | ||
| if (ImportFromModule(module.obj(), "UUID", &ref).ok()) { | ||
| uuid_UUID = ref.obj(); | ||
| } | ||
| }); | ||
| if (!uuid_UUID) return false; | ||
| int result = PyObject_IsInstance(obj, uuid_UUID); | ||
| if (result < 0) { | ||
| PyErr_Clear(); | ||
| return false; | ||
| } | ||
| return result != 0; | ||
| } | ||
|
|
||
| namespace { | ||
|
|
||
| // Once initialized, these variables hold borrowed references to Pandas static data. | ||
| // We should not use OwnedRef here because Python destructors would be | ||
| // called on a finalized interpreter. | ||
|
|
@@ -315,72 +368,43 @@ static PyObject* pandas_Timedelta = nullptr; | |
| static PyObject* pandas_Timestamp = nullptr; | ||
| static PyTypeObject* pandas_NaTType = nullptr; | ||
| static PyObject* pandas_DateOffset = nullptr; | ||
| static ModuleOnceRunner pandas_runner("pandas"); | ||
|
|
||
| void GetPandasStaticSymbols() { | ||
| OwnedRef pandas; | ||
|
|
||
| // Import pandas | ||
| Status s = ImportModule("pandas", &pandas); | ||
| if (!s.ok()) { | ||
| return; | ||
| } | ||
|
|
||
| #ifndef Py_GIL_DISABLED | ||
| // Since ImportModule can release the GIL, another thread could have | ||
| // already initialized the static data. | ||
| if (pandas_static_initialized) { | ||
| return; | ||
| } | ||
| #endif | ||
|
|
||
| OwnedRef ref; | ||
|
|
||
| // set NaT sentinel and its type | ||
| if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) { | ||
| pandas_NaT = ref.obj(); | ||
| // PyObject_Type returns a new reference but we trust that pandas.NaT will | ||
| // outlive our use of this PyObject* | ||
| pandas_NaTType = Py_TYPE(ref.obj()); | ||
| } | ||
|
|
||
| // retain a reference to Timedelta | ||
| if (ImportFromModule(pandas.obj(), "Timedelta", &ref).ok()) { | ||
| pandas_Timedelta = ref.obj(); | ||
| } | ||
| } // namespace | ||
|
|
||
| // retain a reference to Timestamp | ||
| if (ImportFromModule(pandas.obj(), "Timestamp", &ref).ok()) { | ||
| pandas_Timestamp = ref.obj(); | ||
| } | ||
| void InitPandasStaticData() { | ||
| pandas_runner.RunOnce([](OwnedRef& module) { | ||
| OwnedRef ref; | ||
|
|
||
| // set NaT sentinel and its type | ||
| if (ImportFromModule(module.obj(), "NaT", &ref).ok()) { | ||
| pandas_NaT = ref.obj(); | ||
| // PyObject_Type returns a new reference but we trust that pandas.NaT will | ||
| // outlive our use of this PyObject* | ||
| pandas_NaTType = Py_TYPE(ref.obj()); | ||
| } | ||
|
|
||
| // if pandas.NA exists, retain a reference to it | ||
| if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) { | ||
| pandas_NA = ref.obj(); | ||
| } | ||
| // retain a reference to Timedelta | ||
| if (ImportFromModule(module.obj(), "Timedelta", &ref).ok()) { | ||
| pandas_Timedelta = ref.obj(); | ||
| } | ||
|
|
||
| // Import DateOffset type | ||
| if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) { | ||
| pandas_DateOffset = ref.obj(); | ||
| } | ||
| } | ||
| // retain a reference to Timestamp | ||
| if (ImportFromModule(module.obj(), "Timestamp", &ref).ok()) { | ||
| pandas_Timestamp = ref.obj(); | ||
| } | ||
|
|
||
| } // namespace | ||
| // if pandas.NA exists, retain a reference to it | ||
| if (ImportFromModule(module.obj(), "NA", &ref).ok()) { | ||
| pandas_NA = ref.obj(); | ||
| } | ||
|
|
||
| #ifdef Py_GIL_DISABLED | ||
| void InitPandasStaticData() { | ||
| std::call_once(pandas_static_initialized, GetPandasStaticSymbols); | ||
| } | ||
| #else | ||
| void InitPandasStaticData() { | ||
| // NOTE: This is called with the GIL held. We needn't (and shouldn't, | ||
| // to avoid deadlocks) use an additional C++ lock (ARROW-10519). | ||
| if (pandas_static_initialized) { | ||
| return; | ||
| } | ||
| GetPandasStaticSymbols(); | ||
| pandas_static_initialized = true; | ||
| // Import DateOffset type | ||
| if (ImportFromModule(module.obj(), "DateOffset", &ref).ok()) { | ||
| pandas_DateOffset = ref.obj(); | ||
| } | ||
| }); | ||
| } | ||
| #endif | ||
|
|
||
| bool PandasObjectIsNull(PyObject* obj) { | ||
| if (!MayHaveNaN(obj)) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,6 +36,7 @@ | |
| #include "arrow/array/builder_primitive.h" | ||
| #include "arrow/array/builder_time.h" | ||
| #include "arrow/chunked_array.h" | ||
| #include "arrow/extension_type.h" | ||
| #include "arrow/result.h" | ||
| #include "arrow/scalar.h" | ||
| #include "arrow/status.h" | ||
|
|
@@ -512,7 +513,12 @@ class PyValue { | |
|
|
||
| static Status Convert(const FixedSizeBinaryType* type, const O&, I obj, | ||
| PyBytesView& view) { | ||
| ARROW_RETURN_NOT_OK(view.ParseString(obj)); | ||
| // Check if obj is a uuid.UUID instance | ||
| if (internal::IsPyUuid(obj)) { | ||
| ARROW_RETURN_NOT_OK(view.ParseUuid(obj)); | ||
| } else { | ||
| ARROW_RETURN_NOT_OK(view.ParseString(obj)); | ||
| } | ||
| if (view.size != type->byte_width()) { | ||
| std::stringstream ss; | ||
| ss << "expected to be length " << type->byte_width() << " was " << view.size; | ||
|
|
@@ -1268,16 +1274,24 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject* | |
| // In some cases, type inference may be "loose", like strings. If the user | ||
| // passed pa.string(), then we will error if we encounter any non-UTF8 | ||
| // value. If not, then we will allow the result to be a BinaryArray | ||
| std::shared_ptr<DataType> extension_type; | ||
| if (options.type == nullptr) { | ||
| ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas)); | ||
| options.strict = false; | ||
| // If type inference returned an extension type, convert using | ||
| // the storage type and then wrap the result as an extension array | ||
| if (options.type->id() == Type::EXTENSION) { | ||
| extension_type = options.type; | ||
| options.type = checked_cast<const ExtensionType&>(*options.type).storage_type(); | ||
| } | ||
| } else { | ||
| options.strict = true; | ||
| } | ||
| ARROW_DCHECK_GE(size, 0); | ||
|
|
||
| ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter<PyConverter, PyConverterTrait>( | ||
| options.type, options, pool))); | ||
|
Comment on lines
1292
to
1293
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does |
||
| std::shared_ptr<ChunkedArray> result; | ||
| if (converter->may_overflow()) { | ||
| // The converter hierarchy contains binary- or list-like builders which can overflow | ||
| // depending on the input values. Wrap the converter with a chunker which detects | ||
|
|
@@ -1288,7 +1302,7 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject* | |
| } else { | ||
| RETURN_NOT_OK(chunked_converter->Extend(seq, size)); | ||
| } | ||
| return chunked_converter->ToChunkedArray(); | ||
| ARROW_ASSIGN_OR_RAISE(result, chunked_converter->ToChunkedArray()); | ||
| } else { | ||
| // If the converter can't overflow spare the capacity error checking on the hot-path, | ||
| // this improves the performance roughly by ~10% for primitive types. | ||
|
|
@@ -1297,8 +1311,13 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject* | |
| } else { | ||
| RETURN_NOT_OK(converter->Extend(seq, size)); | ||
| } | ||
| return converter->ToChunkedArray(); | ||
| ARROW_ASSIGN_OR_RAISE(result, converter->ToChunkedArray()); | ||
| } | ||
| // If we inferred an extension type, wrap as an extension array | ||
| if (extension_type != nullptr) { | ||
| return ExtensionType::WrapArray(extension_type, result); | ||
| } | ||
| return result; | ||
| } | ||
|
|
||
| } // namespace py | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.