Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion cpp/src/arrow/compute/kernels/hash_aggregate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <functional>
#include <memory>
#include <string>
#include <type_traits>
#include <vector>

#include "arrow/array/builder_nested.h"
Expand Down Expand Up @@ -277,8 +278,10 @@ template <typename T>
concept CBooleanConcept = std::same_as<T, bool>;

// XXX: Ideally we want to have std::floating_point<Float16> = true.
// Note: Using std::is_floating_point_v instead of std::floating_point concept
// for compatibility with older compilers (e.g., Apple Clang 14.0.0)
template <typename T>
concept CFloatingPointConcept = std::floating_point<T> || std::same_as<T, util::Float16>;
concept CFloatingPointConcept = std::is_floating_point_v<T> || std::same_as<T, util::Float16>;

template <typename T>
concept CDecimalConcept = std::same_as<T, Decimal32> || std::same_as<T, Decimal64> ||
Expand Down
88 changes: 87 additions & 1 deletion cpp/src/arrow/json/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@

#include "arrow/json/parser.h"

#include <cctype>
#include <functional>
#include <limits>
#include <memory>
#include <string>
#include <string_view>
#include <tuple>
#include <unordered_map>
Expand Down Expand Up @@ -654,7 +656,8 @@ class HandlerBase : public BlockParser,
: BlockParser(pool),
builder_set_(pool),
field_index_(-1),
scalar_values_builder_(pool) {}
scalar_values_builder_(pool),
explicit_schema_(nullptr) {}

/// Retrieve a pointer to a builder from a BuilderPtr
template <Kind::type kind>
Expand All @@ -679,6 +682,15 @@ class HandlerBase : public BlockParser,
bool Bool(bool value) {
constexpr auto kind = Kind::kBoolean;
if (ARROW_PREDICT_FALSE(builder_.kind != kind)) {
// When explicit schema is provided, try to convert the value
if (explicit_schema_ != nullptr) {
std::string bool_str = value ? "true" : "false";
status_ = TryConvertAndAppend(kind, builder_, bool_str);
if (status_.ok()) {
return true;
}
// If conversion failed, fall through to error
}
status_ = IllegallyChangedTo(kind);
return status_.ok();
}
Expand Down Expand Up @@ -729,6 +741,7 @@ class HandlerBase : public BlockParser,

/// \brief Set up builders using an expected Schema
Status Initialize(const std::shared_ptr<Schema>& s) {
explicit_schema_ = s;
auto type = struct_({});
if (s) {
type = struct_(s->fields());
Expand Down Expand Up @@ -808,6 +821,14 @@ class HandlerBase : public BlockParser,
template <Kind::type kind>
Status AppendScalar(BuilderPtr builder, std::string_view scalar) {
if (ARROW_PREDICT_FALSE(builder.kind != kind)) {
// When explicit schema is provided, try to convert the value
if (explicit_schema_ != nullptr) {
Status convert_status = TryConvertAndAppend(kind, builder, scalar);
if (convert_status.ok()) {
return Status::OK();
}
// If conversion failed, fall through to error
}
return IllegallyChangedTo(kind);
}
auto index = static_cast<int32_t>(scalar_values_builder_.length());
Expand Down Expand Up @@ -918,6 +939,69 @@ class HandlerBase : public BlockParser,
" to ", Kind::Name(illegally_changed_to), " in row ", num_rows_);
}

/// \brief Check whether `text` is a well-formed JSON number literal
///
/// Accepted grammar (RFC 8259, section 6):
///   -? (0 | [1-9][0-9]*) ('.' [0-9]+)? ([eE] [+-]? [0-9]+)?
/// Surrounding whitespace, a leading '+', and the empty string are rejected.
static bool IsJsonNumberLiteral(std::string_view text) {
  const size_t size = text.size();
  size_t pos = 0;
  const auto is_digit_at = [&](size_t i) {
    return i < size && text[i] >= '0' && text[i] <= '9';
  };
  // Consumes a run of digits; returns false if the run is empty.
  const auto consume_digits = [&] {
    const size_t start = pos;
    while (is_digit_at(pos)) {
      ++pos;
    }
    return pos > start;
  };

  if (pos < size && text[pos] == '-') {
    ++pos;
  }
  if (!is_digit_at(pos)) {
    return false;
  }
  if (text[pos] == '0') {
    // A leading zero must stand alone in the integer part
    ++pos;
  } else {
    consume_digits();
  }
  if (pos < size && text[pos] == '.') {
    ++pos;
    if (!consume_digits()) {
      return false;
    }
  }
  if (pos < size && (text[pos] == 'e' || text[pos] == 'E')) {
    ++pos;
    if (pos < size && (text[pos] == '+' || text[pos] == '-')) {
      ++pos;
    }
    if (!consume_digits()) {
      return false;
    }
  }
  return pos == size;
}

/// \brief Check whether a JSON number literal denotes zero
///
/// The value is zero exactly when the mantissa (everything before an optional
/// exponent) contains no non-zero digit, so "0", "-0", "0.00" and "0e7" all
/// qualify. An empty input is reported as non-zero.
static bool IsZeroNumberLiteral(std::string_view number) {
  if (number.empty()) {
    return false;
  }
  for (const char c : number) {
    if (c == 'e' || c == 'E') {
      break;
    }
    if (c >= '1' && c <= '9') {
      return false;
    }
  }
  return true;
}

/// \brief Try to coerce a JSON scalar so that it matches the builder's kind
///
/// Called when the kind of the incoming JSON value (`json_kind`) differs from
/// `builder.kind`. Supported coercions:
///   - number  -> string   (the number's source text is stored as-is)
///   - boolean -> string   ("true" / "false")
///   - string  -> number   (only if the string is a valid JSON number literal)
///   - boolean -> number   ("1" / "0")
///   - number  -> boolean  (zero -> false, anything else -> true)
///   - string  -> boolean  (case-insensitive "true"/"1"/"yes", "false"/"0"/"no")
///
/// \param json_kind kind of the value encountered in the JSON input
/// \param builder   builder the value should be appended to
/// \param scalar    textual form of the value; for booleans the caller passes
///                  "true" or "false"
/// \return OK if the value was converted and appended, Invalid otherwise
///
/// Every AppendScalar<K> call below uses K == builder.kind, so the kind
/// mismatch branch of AppendScalar (which calls back into this function) is
/// never re-entered.
Status TryConvertAndAppend(Kind::type json_kind, BuilderPtr builder,
                           std::string_view scalar) {
  switch (builder.kind) {
    case Kind::kString: {
      if (json_kind == Kind::kNumber) {
        // Numbers are parsed as strings (kParseNumbersAsStringsFlag), so the
        // scalar already holds the textual representation.
        return AppendScalar<Kind::kString>(builder, scalar);
      }
      if (json_kind == Kind::kBoolean) {
        const bool truthy = (scalar == "true" || scalar == "1");
        return AppendScalar<Kind::kString>(builder, truthy ? "true" : "false");
      }
      break;
    }
    case Kind::kNumber: {
      if (json_kind == Kind::kString) {
        // Only accept strings that really are numbers; anything else falls
        // through to the "cannot convert" error instead of being stored as a
        // bogus numeric value.
        if (!IsJsonNumberLiteral(scalar)) {
          break;
        }
        return AppendScalar<Kind::kNumber>(builder, scalar);
      }
      if (json_kind == Kind::kBoolean) {
        const bool truthy = (scalar == "true" || scalar == "1");
        return AppendScalar<Kind::kNumber>(builder, truthy ? "1" : "0");
      }
      break;
    }
    case Kind::kBoolean: {
      // NOTE(review): AppendScalar is only partly visible here and appears to
      // append an index into the scalar storage; confirm that instantiating it
      // with Kind::kBoolean records the boolean value itself.
      if (json_kind == Kind::kNumber) {
        return AppendScalar<Kind::kBoolean>(
            builder, IsZeroNumberLiteral(scalar) ? "false" : "true");
      }
      if (json_kind == Kind::kString) {
        std::string lowered;
        lowered.reserve(scalar.size());
        for (const char c : scalar) {
          // tolower() requires a value representable as unsigned char
          lowered += static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
        }
        if (lowered == "true" || lowered == "1" || lowered == "yes") {
          return AppendScalar<Kind::kBoolean>(builder, "true");
        }
        if (lowered == "false" || lowered == "0" || lowered == "no") {
          return AppendScalar<Kind::kBoolean>(builder, "false");
        }
      }
      break;
    }
    default:
      break;
  }
  // Conversion not supported
  return Status::Invalid("Cannot convert ", Kind::Name(json_kind), " to ",
                         Kind::Name(builder.kind));
}

/// Reserve storage for scalars, these can occupy almost all of the JSON buffer
Status ReserveScalarStorage(int64_t size) override {
auto available_storage = scalar_values_builder_.value_data_capacity() -
Expand All @@ -941,6 +1025,8 @@ class HandlerBase : public BlockParser,
// top of this stack == field_index_
std::vector<int> field_index_stack_;
StringBuilder scalar_values_builder_;
// Store explicit schema for type conversion
std::shared_ptr<Schema> explicit_schema_;
};

template <UnexpectedFieldBehavior>
Expand Down
26 changes: 26 additions & 0 deletions cpp/src/arrow/json/parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,32 @@ TEST_P(BlockParserTypeError, FailOnInconvertible) {
"JSON parse error: Column(/a) changed from number to boolean in row 1"));
}

TEST_P(BlockParserTypeError, AllowNumberToStringConversion) {
  // Column "a" is declared as utf8 in the explicit schema. The first row holds
  // a JSON string and the second a bare JSON number (456); parsing is expected
  // to succeed instead of reporting a kind change.
  auto parse_options = Options(schema({field("a", utf8())}));
  std::shared_ptr<Array> result;
  ASSERT_OK(ParseFromString(parse_options, "{\"a\":\"123\"}\n{\"a\":456}", &result));

  auto rows = std::static_pointer_cast<StructArray>(result);
  ASSERT_NE(rows, nullptr);

  auto column_a = rows->GetFieldByName("a");
  ASSERT_NE(column_a, nullptr);
  // Both rows must have landed in the column
  ASSERT_EQ(column_a->length(), 2);
}

TEST_P(BlockParserTypeError, AllowStringToNumberConversion) {
  // Column "a" is declared as int64 in the explicit schema. The first row holds
  // a JSON number and the second a numeric JSON string ("456"); parsing is
  // expected to succeed instead of reporting a kind change.
  auto parse_options = Options(schema({field("a", int64())}));
  std::shared_ptr<Array> result;
  ASSERT_OK(ParseFromString(parse_options, "{\"a\":123}\n{\"a\":\"456\"}", &result));

  auto rows = std::static_pointer_cast<StructArray>(result);
  ASSERT_NE(rows, nullptr);

  auto column_a = rows->GetFieldByName("a");
  ASSERT_NE(column_a, nullptr);
  // Both rows must have landed in the column
  ASSERT_EQ(column_a->length(), 2);
}

TEST_P(BlockParserTypeError, FailOnNestedInconvertible) {
auto options = Options(schema({field("a", list(struct_({field("b", int32())})))}));
std::shared_ptr<Array> parsed;
Expand Down
2 changes: 1 addition & 1 deletion dev/release/download_rc_binaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def _download_url(self, url, dest_path, *, extra_args=None):
os.remove(dest_path)
except IOError:
pass
if "OpenSSL" not in stderr:
if b"OpenSSL" not in stderr:
# We assume curl has already retried on other errors.
break
else:
Expand Down
2 changes: 1 addition & 1 deletion docs/source/_static/versions.json
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,6 @@
{
"name": "1.0",
"version": "1.0/",
"url": "https://arrow.apache.org/docs/dev/"
"url": "https://arrow.apache.org/docs/1.0/"
}
]
7 changes: 5 additions & 2 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1136,7 +1136,7 @@ cdef class Array(_PandasConvertible):
result = self.ap.Diff(deref(other.ap))
return frombytes(result, safe=True)

def cast(self, object target_type=None, safe=None, options=None, memory_pool=None):
def cast(self, object target_type=None, safe=None, options=None, memory_pool=None, *, errors='raise'):
"""
Cast array values to another data type

Expand All @@ -1152,14 +1152,17 @@ cdef class Array(_PandasConvertible):
Additional checks pass by CastOptions
memory_pool : MemoryPool, optional
memory pool to use for allocations during function execution.
errors : str, default 'raise'
What to do if a value cannot be casted to the target type.
'raise' will raise an error, 'coerce' will produce a null.

Returns
-------
cast : Array
"""
self._assert_cpu()
return _pc().cast(self, target_type, safe=safe,
options=options, memory_pool=memory_pool)
options=options, memory_pool=memory_pool, errors=errors)

def view(self, object target_type):
"""
Expand Down
90 changes: 81 additions & 9 deletions python/pyarrow/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,8 @@ def _make_global_functions():
utf8_zfill = utf8_zero_fill = globals()["utf8_zero_fill"]


def cast(arr, target_type=None, safe=None, options=None, memory_pool=None):
def cast(arr, target_type=None, safe=None, options=None, memory_pool=None, *,
errors='raise'):
"""
Cast array values to another data type. Can also be invoked as an array
instance method.
Expand All @@ -357,10 +358,11 @@ def cast(arr, target_type=None, safe=None, options=None, memory_pool=None):
Type to cast to
safe : bool, default True
Check for overflows or other unsafe conversions
options : CastOptions, default None
Additional checks pass by CastOptions
memory_pool : MemoryPool, optional
memory pool to use for allocations during function execution.
errors : str, default 'raise'
What to do if a value cannot be casted to the target type.
'raise' will raise an error, 'coerce' will produce a null.

Examples
--------
Expand Down Expand Up @@ -394,26 +396,96 @@ def cast(arr, target_type=None, safe=None, options=None, memory_pool=None):
>>> arr.cast('timestamp[ms]').type
TimestampType(timestamp[ms])

Use ``errors='coerce'`` to convert invalid values to null instead of
raising an error:

>>> arr = pa.array(["1.2", "3", "10-20", None, "nan", ""])
>>> cast(arr, pa.float64(), errors='coerce')
<pyarrow.lib.DoubleArray object at ...>
[
1.2,
3.0,
null,
null,
nan,
null
]

Returns
-------
casted : Array
The cast result as a new Array
"""
safe_vars_passed = (safe is not None) or (target_type is not None)

if safe_vars_passed and (options is not None):
raise ValueError("Must either pass values for 'target_type' and 'safe'"
" or pass a value for 'options'")

# Validate parameter combinations
if target_type is not None and options is not None:
raise ValueError("Must either pass 'target_type' (and optionally 'safe') "
"or pass 'options', but not both")

if options is None:
if target_type is None:
raise ValueError("Must provide either 'target_type' or 'options'")
target_type = pa.types.lib.ensure_type(target_type)
if safe is False:
options = CastOptions.unsafe(target_type)
else:
options = CastOptions.safe(target_type)

# Apply errors parameter regardless of whether options was provided
if errors == 'coerce':
options.null_on_error = True
elif errors == 'raise':
options.null_on_error = False
else:
raise ValueError("errors must be either 'raise' or 'coerce'")

return call_function("cast", [arr], options, memory_pool)


def is_castable(arr, target_type=None, options=None, memory_pool=None):
    """
    Check if values can be cast to another data type.

    Returns true for each value that can be successfully cast to the
    target type.

    Parameters
    ----------
    arr : Array-like
    target_type : DataType or str, optional
        The PyArrow type to check castability to.
    options : CastOptions, optional
        Casting options. If passed, 'target_type' must be None.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    is_castable : Array
        A boolean array

    Raises
    ------
    ValueError
        If both 'target_type' and 'options' are passed, or if neither is.

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["1.1", "2.2", "abc", "4.4"])
    >>> pc.is_castable(arr, pa.float64())
    <pyarrow.lib.BooleanArray object at ...>
    [
      true,
      true,
      false,
      true
    ]
    """
    if target_type is not None and options is not None:
        raise ValueError("Must either pass 'target_type' or 'options'")

    if options is None:
        # Mirror cast(): fail with a clear message instead of handing None
        # to ensure_type().
        if target_type is None:
            raise ValueError("Must provide either 'target_type' or 'options'")
        target_type = pa.types.lib.ensure_type(target_type)
        options = CastOptions.safe(target_type)

    return call_function("is_castable", [arr], options, memory_pool)


def index(data, value, start=None, end=None, *, memory_pool=None):
"""
Find the index of the first occurrence of a given value.
Expand Down
7 changes: 5 additions & 2 deletions python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ cdef class Scalar(_Weakrefable):
"""
return self.wrapped.get().is_valid

def cast(self, object target_type=None, safe=None, options=None, memory_pool=None):
def cast(self, object target_type=None, safe=None, options=None, memory_pool=None, *, errors='raise'):
"""
Cast scalar value to another data type.

Expand All @@ -86,13 +86,16 @@ cdef class Scalar(_Weakrefable):
Additional checks pass by CastOptions
memory_pool : MemoryPool, optional
memory pool to use for allocations during function execution.
errors : str, default 'raise'
What to do if a value cannot be casted to the target type.
'raise' will raise an error, 'coerce' will produce a null.

Returns
-------
scalar : A Scalar of the given target data type.
"""
return _pc().cast(self, target_type, safe=safe,
options=options, memory_pool=memory_pool)
options=options, memory_pool=memory_pool, errors=errors)

def validate(self, *, full=False):
"""
Expand Down
Loading
Loading