diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h
index 423a0123c05..ab2ce9cdc74 100644
--- a/cpp/src/arrow/json/test_common.h
+++ b/cpp/src/arrow/json/test_common.h
@@ -33,6 +33,7 @@
 #include "arrow/json/parser.h"
 #include "arrow/json/rapidjson_defs.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
 #include "arrow/type.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/visit_type_inline.h"
@@ -110,20 +111,21 @@ struct GenerateImpl {
     return OK(writer.Double(val));
   }
 
-  Status GenerateAscii(const DataType&) {
-    auto size = std::poisson_distribution<>{4}(e);
-    std::uniform_int_distribution<uint16_t> gen_char(32, 126);  // FIXME generate UTF8
-    std::string s(size, '\0');
-    for (char& ch : s) ch = static_cast<char>(gen_char(e));
-    return OK(writer.String(s.c_str()));
+  // Write a JSON string made of a Poisson-distributed (mean 4) number of random
+  // Unicode code points; the string generator is seeded from this object's engine.
+  Status GenerateUtf8(const DataType&) {
+    auto num_codepoints = std::poisson_distribution<>{4}(e);
+    auto seed = std::uniform_int_distribution<random::SeedType>{}(e);
+    std::string s = RandomUtf8String(seed, num_codepoints);
+    return OK(writer.String(s));
   }
 
   template <typename T>
   enable_if_base_binary<T, Status> Visit(const T& t) {
-    return GenerateAscii(t);
+    return GenerateUtf8(t);
   }
 
-  Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }
+  Status Visit(const BinaryViewType& t) { return GenerateUtf8(t); }
 
   template <typename T>
   enable_if_list_like<T, Status> Visit(const T& t) {
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index c50387e4909..f73dbd5bbf7 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -1475,4 +1475,83 @@ void rand_month_day_nanos(int64_t N,
   });
 }
 
+std::string RandomUtf8String(random::SeedType seed, int num_chars) {
+  arrow::random::pcg32 gen(seed);
+  std::string s;
+  // A code point takes at most 4 UTF-8 bytes, so this bound avoids reallocation.
+  // A non-positive count reserves nothing instead of wrapping to a huge size_t.
+  s.reserve(num_chars > 0 ? static_cast<size_t>(num_chars) * 4 : 0);
+
+  std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
+  std::bernoulli_distribution bmp_range_dist(0.5);
+  std::uniform_int_distribution<uint32_t> bmp_lower_dist(0x0020, 0xD7FF);
+  std::uniform_int_distribution<uint32_t> bmp_upper_dist(0xE000, 0xFFFD);
+  std::uniform_int_distribution<uint32_t> smp_dist(0x10000, 0x1FFFF);
+  std::uniform_int_distribution<uint32_t> sip_dist(0x20000, 0x2FFFF);
+  std::uniform_int_distribution<uint32_t> high_plane_dist(0x30000, 0x10FFFF);
+
+  for (int i = 0; i < num_chars; ++i) {
+    uint32_t codepoint;
+    uint32_t plane = plane_dist(gen);
+
+    if (plane == 0) {
+      // Basic Multilingual Plane (BMP): U+0000 to U+FFFF
+      // Exclude surrogate code points (U+D800 to U+DFFF)
+      // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71)
+      // Exclude control chars below U+0020 for readability
+      // Generate from two ranges with equal probability (overrepresents the smaller
+      // upper range):
+      // - Lower: U+0020 to U+D7FF (55,264 values, 50% selection probability)
+      // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
+      if (bmp_range_dist(gen)) {
+        // Lower range: U+0020 to U+D7FF (before surrogate range)
+        codepoint = bmp_lower_dist(gen);
+      } else {
+        // Upper range: U+E000 to U+FFFD (after surrogate range)
+        // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
+        // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included
+        // as they are valid Unicode scalar values per the Unicode Standard
+        codepoint = bmp_upper_dist(gen);
+      }
+    } else if (plane == 1) {
+      // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
+      // https://www.unicode.org/roadmaps/smp/
+      codepoint = smp_dist(gen);
+    } else if (plane == 2) {
+      // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
+      // https://www.unicode.org/roadmaps/sip/
+      codepoint = sip_dist(gen);
+    } else {
+      // Planes 3-16: U+30000 to U+10FFFF
+      // Includes TIP, SSP, PUA-A, PUA-B, and the currently unassigned planes
+      // Max valid Unicode codepoint is U+10FFFF per the Standard
+      // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
+      codepoint = high_plane_dist(gen);
+    }
+
+    // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
+    // https://www.rfc-editor.org/rfc/rfc3629.html#section-3
+    if (codepoint <= 0x7F) {
+      // 1-byte sequence: 0xxxxxxx
+      s.push_back(static_cast<char>(codepoint));
+    } else if (codepoint <= 0x7FF) {
+      // 2-byte sequence: 110xxxxx 10xxxxxx
+      s.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
+      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+    } else if (codepoint <= 0xFFFF) {
+      // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+      s.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
+      s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+    } else {
+      // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      s.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
+      s.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
+      s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+    }
+  }
+  return s;
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index d9122915a09..f820e643986 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -729,6 +729,25 @@ ARROW_TESTING_EXPORT
 void rand_month_day_nanos(int64_t N,
                           std::vector<MonthDayNanoIntervalType::MonthDayNanos>* out);
 
+/// \brief Generate a random UTF-8 encoded string
+///
+/// Generates a string with valid UTF-8 encoding from random Unicode scalar values.
+/// Each of the num_chars code points is drawn by first choosing, with equal
+/// probability, one of four groups: the Basic Multilingual Plane (BMP), the
+/// Supplementary Multilingual Plane (SMP), the Supplementary Ideographic Plane
+/// (SIP), or planes 3 to 16 (up to U+10FFFF), and then a code point within it.
+/// The result is therefore not uniform over all code points.
+/// Surrogate code points (U+D800-U+DFFF) are excluded as they are not valid
+/// Unicode scalar values; control characters below U+0020 and the noncharacters
+/// U+FFFE and U+FFFF are never generated either.
+///
+/// \param[in] seed Random seed for reproducibility
+/// \param[in] num_chars Number of Unicode code points to generate; a value of
+/// zero or less yields an empty string
+/// \return a generated UTF-8 encoded string
+ARROW_TESTING_EXPORT
+std::string RandomUtf8String(random::SeedType seed, int num_chars);
+
 template <typename T, typename U>
 void randint(int64_t N, T lower, T upper, std::vector<U>* out) {
   const int random_seed = 0;