From 64f91009f290b5495f5de8475f23db697f610bc7 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Thu, 22 Jan 2026 18:19:30 +0900 Subject: [PATCH 1/3] [C++] Generate proper UTF-8 strings in JSON test utilities --- cpp/src/arrow/json/test_common.h | 79 +++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index 423a0123c05..dbaaa9f82ce 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -110,20 +110,85 @@ struct GenerateImpl { return OK(writer.Double(val)); } - Status GenerateAscii(const DataType&) { - auto size = std::poisson_distribution<>{4}(e); - std::uniform_int_distribution gen_char(32, 126); // FIXME generate UTF8 - std::string s(size, '\0'); - for (char& ch : s) ch = static_cast(gen_char(e)); + Status GenerateUtf8(const DataType&) { + // Generate random UTF-8 encoded strings from valid Unicode scalar values. + auto num_codepoints = std::poisson_distribution<>{4}(e); + std::string s; + s.reserve(num_codepoints * 3); + + for (int i = 0; i < num_codepoints; ++i) { + uint32_t codepoint; + std::uniform_int_distribution plane_dist(0, 3); + uint32_t plane = plane_dist(e); + + if (plane == 0) { + // Basic Multilingual Plane (BMP): U+0000 to U+FFFF + // Exclude surrogate code points (U+D800 to U+DFFF) + // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71) + // Exclude control chars below U+0020 for readability + // Generate from two ranges with equal probability (overrepresents the smaller + // upper range): + // - Lower: U+0020 to U+D7FF (55,776 values, 50% selection probability) + // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability) + if (std::bernoulli_distribution(0.5)(e)) { + // Lower range: U+0020 to U+D7FF (before surrogate range) + codepoint = std::uniform_int_distribution(0x0020, 0xD7FF)(e); + } else { + // Upper range: U+E000 to U+FFFD (after surrogate range) + // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF + // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included + // as they are valid Unicode scalar values per the Unicode Standard + codepoint = std::uniform_int_distribution(0xE000, 0xFFFD)(e); + } + } else if (plane == 1) { + // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF + // https://www.unicode.org/roadmaps/smp/ + codepoint = std::uniform_int_distribution(0x10000, 0x1FFFF)(e); + } else if (plane == 2) { + // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF + // https://www.unicode.org/roadmaps/sip/ + codepoint = std::uniform_int_distribution(0x20000, 0x2FFFF)(e); + } else { + // Planes 3–16: U+30000–U+10FFFF + // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF + // Max valid Unicode codepoint is U+10FFFF per the Standard + // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9) + codepoint = std::uniform_int_distribution(0x30000, 0x10FFFF)(e); + } + + // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition) + // https://www.rfc-editor.org/rfc/rfc3629.html#section-3 + if (codepoint <= 0x7F) { + // 1-byte sequence: 0xxxxxxx + s.push_back(static_cast(codepoint)); + } else if (codepoint <= 0x7FF) { + // 2-byte sequence: 110xxxxx 10xxxxxx + s.push_back(static_cast(0xC0 | (codepoint >> 6))); + s.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } else if (codepoint <= 0xFFFF) { + // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx + s.push_back(static_cast(0xE0 | (codepoint >> 12))); + s.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + s.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } else { + // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + s.push_back(static_cast(0xF0 | (codepoint >> 18))); + s.push_back(static_cast(0x80 | ((codepoint >> 12) & 0x3F))); + s.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + s.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } + } + // Using c_str() is safe here because generation excludes U+0000 (no embedded nulls). + // U+0000 can only exist in plane 0 (BMP), and BMP generation starts at U+0020. return OK(writer.String(s.c_str())); } template enable_if_base_binary Visit(const T& t) { - return GenerateAscii(t); + return GenerateUtf8(t); } - Status Visit(const BinaryViewType& t) { return GenerateAscii(t); } + Status Visit(const BinaryViewType& t) { return GenerateUtf8(t); } template enable_if_list_like Visit(const T& t) { From e34d25f94632cc06e53c09ebb44db2746d1a60c8 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Wed, 28 Jan 2026 16:49:07 +0900 Subject: [PATCH 2/3] review comment --- cpp/src/arrow/json/test_common.h | 72 ++------------------------------ cpp/src/arrow/testing/random.cc | 71 +++++++++++++++++++++++++++++++ cpp/src/arrow/testing/random.h | 14 +++++++ 3 files changed, 88 insertions(+), 69 deletions(-) diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index dbaaa9f82ce..c5cb5ec3889 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -33,6 +33,7 @@ #include "arrow/json/parser.h" #include "arrow/json/rapidjson_defs.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/visit_type_inline.h" @@ -111,76 +112,9 @@ struct GenerateImpl { } Status GenerateUtf8(const DataType&) { - // Generate random UTF-8 encoded strings from valid Unicode scalar values. auto num_codepoints = std::poisson_distribution<>{4}(e); - std::string s; - s.reserve(num_codepoints * 3); - - for (int i = 0; i < num_codepoints; ++i) { - uint32_t codepoint; - std::uniform_int_distribution plane_dist(0, 3); - uint32_t plane = plane_dist(e); - - if (plane == 0) { - // Basic Multilingual Plane (BMP): U+0000 to U+FFFF - // Exclude surrogate code points (U+D800 to U+DFFF) - // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71) - // Exclude control chars below U+0020 for readability - // Generate from two ranges with equal probability (overrepresents the smaller - // upper range): - // - Lower: U+0020 to U+D7FF (55,776 values, 50% selection probability) - // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability) - if (std::bernoulli_distribution(0.5)(e)) { - // Lower range: U+0020 to U+D7FF (before surrogate range) - codepoint = std::uniform_int_distribution(0x0020, 0xD7FF)(e); - } else { - // Upper range: U+E000 to U+FFFD (after surrogate range) - // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF - // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included - // as they are valid Unicode scalar values per the Unicode Standard - codepoint = std::uniform_int_distribution(0xE000, 0xFFFD)(e); - } - } else if (plane == 1) { - // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF - // https://www.unicode.org/roadmaps/smp/ - codepoint = std::uniform_int_distribution(0x10000, 0x1FFFF)(e); - } else if (plane == 2) { - // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF - // https://www.unicode.org/roadmaps/sip/ - codepoint = std::uniform_int_distribution(0x20000, 0x2FFFF)(e); - } else { - // Planes 3–16: U+30000–U+10FFFF - // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF - // Max valid Unicode codepoint is U+10FFFF per the Standard - // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9) - codepoint = std::uniform_int_distribution(0x30000, 0x10FFFF)(e); - } - - // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition) - // https://www.rfc-editor.org/rfc/rfc3629.html#section-3 - if (codepoint <= 0x7F) { - // 1-byte sequence: 0xxxxxxx - s.push_back(static_cast(codepoint)); - } else if (codepoint <= 0x7FF) { - // 2-byte sequence: 110xxxxx 10xxxxxx - s.push_back(static_cast(0xC0 | (codepoint >> 6))); - s.push_back(static_cast(0x80 | (codepoint & 0x3F))); - } else if (codepoint <= 0xFFFF) { - // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx - s.push_back(static_cast(0xE0 | (codepoint >> 12))); - s.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); - s.push_back(static_cast(0x80 | (codepoint & 0x3F))); - } else { - // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - s.push_back(static_cast(0xF0 | (codepoint >> 18))); - s.push_back(static_cast(0x80 | ((codepoint >> 12) & 0x3F))); - s.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); - s.push_back(static_cast(0x80 | (codepoint & 0x3F))); - } - } - // Using c_str() is safe here because generation excludes U+0000 (no embedded nulls). - // U+0000 can only exist in plane 0 (BMP), and BMP generation starts at U+0020. - return OK(writer.String(s.c_str())); + std::string s = RandomUtf8String(num_codepoints); + return OK(writer.String(s)); } template diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index c50387e4909..1d7e84790ac 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -1475,4 +1475,75 @@ void rand_month_day_nanos(int64_t N, }); } +std::string RandomUtf8String(int num_chars) { + std::random_device rd; + std::default_random_engine gen(rd()); + std::string s; + s.reserve(num_chars * 3); // Reserve for average 3 bytes per codepoint + + for (int i = 0; i < num_chars; ++i) { + uint32_t codepoint; + std::uniform_int_distribution plane_dist(0, 3); + uint32_t plane = plane_dist(gen); + + if (plane == 0) { + // Basic Multilingual Plane (BMP): U+0000 to U+FFFF + // Exclude surrogate code points (U+D800 to U+DFFF) + // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71) + // Exclude control chars below U+0020 for readability + // Generate from two ranges with equal probability (overrepresents the smaller + // upper range): + // - Lower: U+0020 to U+D7FF (55,776 values, 50% selection probability) + // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability) + if (std::bernoulli_distribution(0.5)(gen)) { + // Lower range: U+0020 to U+D7FF (before surrogate range) + codepoint = std::uniform_int_distribution(0x0020, 0xD7FF)(gen); + } else { + // Upper range: U+E000 to U+FFFD (after surrogate range) + // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF + // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included + // as they are valid Unicode scalar values per the Unicode Standard + codepoint = std::uniform_int_distribution(0xE000, 0xFFFD)(gen); + } + } else if (plane == 1) { + // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF + // https://www.unicode.org/roadmaps/smp/ + codepoint = std::uniform_int_distribution(0x10000, 0x1FFFF)(gen); + } else if (plane == 2) { + // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF + // https://www.unicode.org/roadmaps/sip/ + codepoint = std::uniform_int_distribution(0x20000, 0x2FFFF)(gen); + } else { + // Planes 3–16: U+30000–U+10FFFF + // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF + // Max valid Unicode codepoint is U+10FFFF per the Standard + // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9) + codepoint = std::uniform_int_distribution(0x30000, 0x10FFFF)(gen); + } + + // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition) + // https://www.rfc-editor.org/rfc/rfc3629.html#section-3 + if (codepoint <= 0x7F) { + // 1-byte sequence: 0xxxxxxx + s.push_back(static_cast(codepoint)); + } else if (codepoint <= 0x7FF) { + // 2-byte sequence: 110xxxxx 10xxxxxx + s.push_back(static_cast(0xC0 | (codepoint >> 6))); + s.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } else if (codepoint <= 0xFFFF) { + // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx + s.push_back(static_cast(0xE0 | (codepoint >> 12))); + s.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + s.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } else { + // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + s.push_back(static_cast(0xF0 | (codepoint >> 18))); + s.push_back(static_cast(0x80 | ((codepoint >> 12) & 0x3F))); + s.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + s.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } + } + return s; +} + } // namespace arrow diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index d9122915a09..854e5443bfa 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -729,6 +729,20 @@ ARROW_TESTING_EXPORT void rand_month_day_nanos(int64_t N, std::vector* out); +/// \brief Generate a random UTF-8 encoded string +/// +/// Generates a string with valid UTF-8 encoding from random Unicode scalar values. +/// The generated string contains num_chars code points sampled uniformly +/// across the Basic Multilingual Plane (BMP), Supplementary Multilingual Plane (SMP), +/// Supplementary Ideographic Plane (SIP), and higher planes (up to U+10FFFF). +/// Surrogate code points (U+D800-U+DFFF) are excluded as they are not valid +/// Unicode scalar values. +/// +/// \param[in] num_chars Number of Unicode code points to generate +/// \return a generated UTF-8 encoded string +ARROW_TESTING_EXPORT +std::string RandomUtf8String(int num_chars); + template void randint(int64_t N, T lower, T upper, std::vector* out) { const int random_seed = 0; From b681f2b59c9a5f3827b6ed73e8027262b977fbe0 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Thu, 29 Jan 2026 15:26:27 +0900 Subject: [PATCH 3/3] Address a review comment --- cpp/src/arrow/json/test_common.h | 3 ++- cpp/src/arrow/testing/random.cc | 26 ++++++++++++++++---------- cpp/src/arrow/testing/random.h | 3 ++- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index c5cb5ec3889..ab2ce9cdc74 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -113,7 +113,8 @@ struct GenerateImpl { Status GenerateUtf8(const DataType&) { auto num_codepoints = std::poisson_distribution<>{4}(e); - std::string s = RandomUtf8String(num_codepoints); + auto seed = std::uniform_int_distribution{}(e); + std::string s = RandomUtf8String(seed, num_codepoints); return OK(writer.String(s)); } diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 1d7e84790ac..f73dbd5bbf7 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -1475,15 +1475,21 @@ void rand_month_day_nanos(int64_t N, }); } -std::string RandomUtf8String(int num_chars) { - std::random_device rd; - std::default_random_engine gen(rd()); +std::string RandomUtf8String(random::SeedType seed, int num_chars) { + arrow::random::pcg32 gen(seed); std::string s; s.reserve(num_chars * 3); // Reserve for average 3 bytes per codepoint + std::uniform_int_distribution plane_dist(0, 3); + std::bernoulli_distribution bmp_range_dist(0.5); + std::uniform_int_distribution bmp_lower_dist(0x0020, 0xD7FF); + std::uniform_int_distribution bmp_upper_dist(0xE000, 0xFFFD); + std::uniform_int_distribution smp_dist(0x10000, 0x1FFFF); + std::uniform_int_distribution sip_dist(0x20000, 0x2FFFF); + std::uniform_int_distribution high_plane_dist(0x30000, 0x10FFFF); + for (int i = 0; i < num_chars; ++i) { uint32_t codepoint; - std::uniform_int_distribution plane_dist(0, 3); uint32_t plane = plane_dist(gen); if (plane == 0) { @@ -1495,30 +1501,30 @@ std::string RandomUtf8String(int num_chars) { // upper range): // - Lower: U+0020 to U+D7FF (55,776 values, 50% selection probability) // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability) - if (std::bernoulli_distribution(0.5)(gen)) { + if (bmp_range_dist(gen)) { // Lower range: U+0020 to U+D7FF (before surrogate range) - codepoint = std::uniform_int_distribution(0x0020, 0xD7FF)(gen); + codepoint = bmp_lower_dist(gen); } else { // Upper range: U+E000 to U+FFFD (after surrogate range) // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included // as they are valid Unicode scalar values per the Unicode Standard - codepoint = std::uniform_int_distribution(0xE000, 0xFFFD)(gen); + codepoint = bmp_upper_dist(gen); } } else if (plane == 1) { // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF // https://www.unicode.org/roadmaps/smp/ - codepoint = std::uniform_int_distribution(0x10000, 0x1FFFF)(gen); + codepoint = smp_dist(gen); } else if (plane == 2) { // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF // https://www.unicode.org/roadmaps/sip/ - codepoint = std::uniform_int_distribution(0x20000, 0x2FFFF)(gen); + codepoint = sip_dist(gen); } else { // Planes 3–16: U+30000–U+10FFFF // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF // Max valid Unicode codepoint is U+10FFFF per the Standard // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9) - codepoint = std::uniform_int_distribution(0x30000, 0x10FFFF)(gen); + codepoint = high_plane_dist(gen); } // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition) diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 854e5443bfa..f820e643986 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -738,10 +738,11 @@ void rand_month_day_nanos(int64_t N, /// Surrogate code points (U+D800-U+DFFF) are excluded as they are not valid /// Unicode scalar values. /// +/// \param[in] seed Random seed for reproducibility /// \param[in] num_chars Number of Unicode code points to generate /// \return a generated UTF-8 encoded string ARROW_TESTING_EXPORT -std::string RandomUtf8String(int num_chars); +std::string RandomUtf8String(random::SeedType seed, int num_chars); template void randint(int64_t N, T lower, T upper, std::vector* out) {