diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 0b787f461c21..1662dec7d44f 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1988,48 +1988,60 @@ const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 // fill into text but "fill_text" is empty, then return text directly. *out_len = text_len; return text; - } else if (return_length < actual_text_len) { + } + if (return_length < actual_text_len) { // case where it truncates the result on return length. *out_len = utf8_byte_pos(context, text, text_len, return_length); return text; - } else { - // case (return_length > actual_text_len) - // case where it needs to copy "fill_text" on the string left. The total number - // of chars to copy is given by (return_length - actual_text_len) - gdv_int32 return_char_length = evaluate_return_char_length( - text_len, actual_text_len, return_length, fill_text, fill_text_len); - char* ret = reinterpret_cast( - gdv_fn_context_arena_malloc(context, return_char_length)); + } + + gdv_int32 chars_to_pad = return_length - actual_text_len; + + // FAST PATH: Single-byte fill (most common - space padding) + if (fill_text_len == 1) { + gdv_int32 out_len_bytes = chars_to_pad + text_len; + char* ret = + reinterpret_cast(gdv_fn_context_arena_malloc(context, out_len_bytes)); if (ret == nullptr) { gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); *out_len = 0; return ""; } - // try to fulfill the return string with the "fill_text" continuously - int32_t copied_chars_count = 0; - int32_t copied_chars_position = 0; - while (copied_chars_count < return_length - actual_text_len) { - int32_t char_len; - int32_t fill_index; - // for each char, evaluate its length to consider it when mem copying - for (fill_index = 0; fill_index < fill_text_len; fill_index += char_len) { - if (copied_chars_count >= return_length - actual_text_len) { - break; - } - char_len = utf8_char_length(fill_text[fill_index]); - // ignore invalid char on the fill text, considering it as size 1 - if (char_len == 0) char_len += 1; - copied_chars_count++; - } - memcpy(ret + copied_chars_position, fill_text, fill_index); - copied_chars_position += fill_index; - } - // after fulfilling the text, copy the main string - memcpy(ret + copied_chars_position, text, text_len); - *out_len = copied_chars_position + text_len; + memset(ret, fill_text[0], chars_to_pad); + memcpy(ret + chars_to_pad, text, text_len); + *out_len = out_len_bytes; return ret; } + + // GENERAL PATH: Multi-byte fill - use evaluate_return_char_length for buffer size + gdv_int32 return_char_length = evaluate_return_char_length( + text_len, actual_text_len, return_length, fill_text, fill_text_len); + char* ret = reinterpret_cast( + gdv_fn_context_arena_malloc(context, return_char_length)); + if (ret == nullptr) { + gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + + // Fill using doubling strategy (O(log n) memcpy calls) + gdv_int32 total_fill_bytes = return_char_length - text_len; + // Copy only as much of fill_text as we need (may be less than fill_text_len) + gdv_int32 initial_copy = std::min(fill_text_len, total_fill_bytes); + memcpy(ret, fill_text, initial_copy); + gdv_int32 written = initial_copy; + while (written * 2 <= total_fill_bytes) { + memcpy(ret + written, ret, written); + written *= 2; + } + if (written < total_fill_bytes) { + memcpy(ret + written, ret, total_fill_bytes - written); + } + + memcpy(ret + total_fill_bytes, text, text_len); + *out_len = return_char_length; + return ret; } FORCE_INLINE @@ -2054,47 +2066,60 @@ const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 // fill into text but "fill_text" is empty, then return text directly. *out_len = text_len; return text; - } else if (return_length < actual_text_len) { + } + if (return_length < actual_text_len) { // case where it truncates the result on return length. *out_len = utf8_byte_pos(context, text, text_len, return_length); return text; - } else { - // case (return_length > actual_text_len) - // case where it needs to copy "fill_text" on the string right - gdv_int32 return_char_length = evaluate_return_char_length( - text_len, actual_text_len, return_length, fill_text, fill_text_len); - char* ret = reinterpret_cast( - gdv_fn_context_arena_malloc(context, return_char_length)); + } + + gdv_int32 chars_to_pad = return_length - actual_text_len; + + // FAST PATH: Single-byte fill (most common - space padding) + if (fill_text_len == 1) { + gdv_int32 out_len_bytes = chars_to_pad + text_len; + char* ret = + reinterpret_cast(gdv_fn_context_arena_malloc(context, out_len_bytes)); if (ret == nullptr) { gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); *out_len = 0; return ""; } - // fulfill the initial text copying the main input string memcpy(ret, text, text_len); - // try to fulfill the return string with the "fill_text" continuously - int32_t copied_chars_count = 0; - int32_t copied_chars_position = 0; - while (actual_text_len + copied_chars_count < return_length) { - int32_t char_len; - int32_t fill_length; - // for each char, evaluate its length to consider it when mem copying - for (fill_length = 0; fill_length < fill_text_len; fill_length += char_len) { - if (actual_text_len + copied_chars_count >= return_length) { - break; - } - char_len = utf8_char_length(fill_text[fill_length]); - // ignore invalid char on the fill text, considering it as size 1 - if (char_len == 0) char_len += 1; - copied_chars_count++; - } - memcpy(ret + text_len + copied_chars_position, fill_text, fill_length); - copied_chars_position += fill_length; - } - *out_len = copied_chars_position + text_len; + memset(ret + text_len, fill_text[0], chars_to_pad); + *out_len = out_len_bytes; return ret; } + + // GENERAL PATH: Multi-byte fill - use evaluate_return_char_length for buffer size + gdv_int32 return_char_length = evaluate_return_char_length( + text_len, actual_text_len, return_length, fill_text, fill_text_len); + char* ret = reinterpret_cast( + gdv_fn_context_arena_malloc(context, return_char_length)); + if (ret == nullptr) { + gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + + // Copy text first, then fill using doubling strategy + memcpy(ret, text, text_len); + gdv_int32 total_fill_bytes = return_char_length - text_len; + // Copy only as much of fill_text as we need (may be less than fill_text_len) + gdv_int32 initial_copy = std::min(fill_text_len, total_fill_bytes); + memcpy(ret + text_len, fill_text, initial_copy); + gdv_int32 written = initial_copy; + while (written * 2 <= total_fill_bytes) { + memcpy(ret + text_len + written, ret + text_len, written); + written *= 2; + } + if (written < total_fill_bytes) { + memcpy(ret + text_len + written, ret + text_len, total_fill_bytes - written); + } + + *out_len = return_char_length; + return ret; } FORCE_INLINE diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index e0248667e3df..d51b73b63351 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -1318,6 +1318,101 @@ TEST(TestStringOps, TestLpadString) { out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, -1, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "x", 1, 65536, "😀", 4, &out_len); + EXPECT_EQ(out_len, 65535 * 4 + 1); + EXPECT_FALSE(ctx.has_error()); + EXPECT_EQ(out_str[out_len - 1], 'x'); + EXPECT_EQ(static_cast(out_str[0]), 0xF0); + EXPECT_EQ(static_cast(out_str[1]), 0x9F); + EXPECT_EQ(static_cast(out_str[2]), 0x98); + EXPECT_EQ(static_cast(out_str[3]), 0x80); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "A", 1, 65536, "哈", 3, &out_len); + EXPECT_EQ(out_len, 65535 * 3 + 1); + EXPECT_FALSE(ctx.has_error()); + EXPECT_EQ(out_str[out_len - 1], 'A'); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, ".", 1, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ".X"); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 65536, "@", 1, &out_len); + EXPECT_EQ(out_len, 65536); + for (int i = 0; i < 100; i++) { + EXPECT_EQ(out_str[i], '@') << "Mismatch at position " << i; + } + EXPECT_EQ(out_str[out_len - 1], 'Z'); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "END", 3, 11, "ab", 2, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "ababababEND"); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "END", 3, 10, "abc", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "abcabcaEND"); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 5, "αβ", 4, &out_len); + EXPECT_EQ(out_len, 9); + EXPECT_EQ(std::string(out_str, out_len), "αβαβX"); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 4, "中文", 6, &out_len); + EXPECT_EQ(out_len, 10); + EXPECT_EQ(std::string(out_str, out_len), "中文中Y"); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 4, "abc", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "abcX"); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 7, "abc", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "abcabcX"); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 13, "abc", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "abcabcabcabcX"); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 10, "abc", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "abcabcabcX"); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "E", 1, 129, "ab", 2, &out_len); + EXPECT_EQ(out_len, 129); + EXPECT_EQ(out_str[0], 'a'); + EXPECT_EQ(out_str[1], 'b'); + EXPECT_EQ(out_str[126], 'a'); + EXPECT_EQ(out_str[127], 'b'); + EXPECT_EQ(out_str[128], 'E'); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "E", 1, 127, "ab", 2, &out_len); + EXPECT_EQ(out_len, 127); + EXPECT_EQ(out_str[0], 'a'); + EXPECT_EQ(out_str[125], 'b'); + EXPECT_EQ(out_str[126], 'E'); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, "abc", 3, &out_len); + EXPECT_EQ(out_len, 2); + EXPECT_EQ(std::string(out_str, out_len), "aX"); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 3, "abcde", 5, &out_len); + EXPECT_EQ(out_len, 3); + EXPECT_EQ(std::string(out_str, out_len), "abY"); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 2, "αβ", 4, &out_len); + EXPECT_EQ(out_len, 3); + EXPECT_EQ(std::string(out_str, out_len), "αZ"); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "A", 1, 2, "中文字", 9, &out_len); + EXPECT_EQ(out_len, 4); + EXPECT_EQ(std::string(out_str, out_len), "中A"); + + out_str = lpad_utf8_int32_utf8(ctx_ptr, "B", 1, 3, "中文字", 9, &out_len); + EXPECT_EQ(out_len, 7); + EXPECT_EQ(std::string(out_str, out_len), "中文B"); + + std::string large_text(5000, 'X'); + std::string large_fill; + for (int i = 0; i < 50; ++i) { + large_fill += "α"; + } + out_str = lpad_utf8_int32_utf8(ctx_ptr, large_text.c_str(), 5000, 5001, + large_fill.c_str(), 100, &out_len); + EXPECT_EQ(out_len, 5002); + EXPECT_EQ(std::string(out_str, 2), "α"); + EXPECT_EQ(std::string(out_str + 2, 5000), large_text); } TEST(TestStringOps, TestRpadString) { @@ -1396,6 +1491,101 @@ TEST(TestStringOps, TestRpadString) { out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, -1, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "x", 1, 65536, "😀", 4, &out_len); + EXPECT_EQ(out_len, 1 + 65535 * 4); + EXPECT_FALSE(ctx.has_error()); + EXPECT_EQ(out_str[0], 'x'); + EXPECT_EQ(static_cast(out_str[out_len - 4]), 0xF0); + EXPECT_EQ(static_cast(out_str[out_len - 3]), 0x9F); + EXPECT_EQ(static_cast(out_str[out_len - 2]), 0x98); + EXPECT_EQ(static_cast(out_str[out_len - 1]), 0x80); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "A", 1, 65536, "哈", 3, &out_len); + EXPECT_EQ(out_len, 1 + 65535 * 3); + EXPECT_FALSE(ctx.has_error()); + EXPECT_EQ(out_str[0], 'A'); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, ".", 1, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "X."); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 65536, "@", 1, &out_len); + EXPECT_EQ(out_len, 65536); + EXPECT_EQ(out_str[0], 'Z'); + for (int i = 1; i < 100; i++) { + EXPECT_EQ(out_str[i], '@') << "Mismatch at position " << i; + } + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "BEG", 3, 11, "ab", 2, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "BEGabababab"); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "BEG", 3, 10, "abc", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "BEGabcabca"); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 5, "αβ", 4, &out_len); + EXPECT_EQ(out_len, 9); + EXPECT_EQ(std::string(out_str, out_len), "Xαβαβ"); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 4, "中文", 6, &out_len); + EXPECT_EQ(out_len, 10); + EXPECT_EQ(std::string(out_str, out_len), "Y中文中"); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 4, "abc", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "Xabc"); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 7, "abc", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "Xabcabc"); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 13, "abc", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "Xabcabcabcabc"); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 10, "abc", 3, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "Xabcabcabc"); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "S", 1, 129, "ab", 2, &out_len); + EXPECT_EQ(out_len, 129); + EXPECT_EQ(out_str[0], 'S'); + EXPECT_EQ(out_str[1], 'a'); + EXPECT_EQ(out_str[2], 'b'); + EXPECT_EQ(out_str[127], 'a'); + EXPECT_EQ(out_str[128], 'b'); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "S", 1, 127, "ab", 2, &out_len); + EXPECT_EQ(out_len, 127); + EXPECT_EQ(out_str[0], 'S'); + EXPECT_EQ(out_str[125], 'a'); + EXPECT_EQ(out_str[126], 'b'); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, "abc", 3, &out_len); + EXPECT_EQ(out_len, 2); + EXPECT_EQ(std::string(out_str, out_len), "Xa"); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 3, "abcde", 5, &out_len); + EXPECT_EQ(out_len, 3); + EXPECT_EQ(std::string(out_str, out_len), "Yab"); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 2, "αβ", 4, &out_len); + EXPECT_EQ(out_len, 3); + EXPECT_EQ(std::string(out_str, out_len), "Zα"); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "A", 1, 2, "中文字", 9, &out_len); + EXPECT_EQ(out_len, 4); + EXPECT_EQ(std::string(out_str, out_len), "A中"); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, "B", 1, 3, "中文字", 9, &out_len); + EXPECT_EQ(out_len, 7); + EXPECT_EQ(std::string(out_str, out_len), "B中文"); + + std::string large_text(5000, 'X'); + std::string large_fill; + for (int i = 0; i < 50; ++i) { + large_fill += "α"; + } + out_str = rpad_utf8_int32_utf8(ctx_ptr, large_text.c_str(), 5000, 5001, + large_fill.c_str(), 100, &out_len); + EXPECT_EQ(out_len, 5002); + EXPECT_EQ(std::string(out_str, 5000), large_text); + EXPECT_EQ(std::string(out_str + 5000, 2), "α"); } TEST(TestStringOps, TestRtrim) {