Skip to content

Commit 7dacbd0

Browse files
authored
GH-48941: [C++] Generate proper UTF-8 strings in JSON test utilities (#48943)
### Rationale for this change The JSON test utility `GenerateAscii` only generated ASCII characters. The tests should also cover proper UTF-8 and Unicode handling. ### What changes are included in this PR? Replaced ASCII-only generation with proper UTF-8 string generation that produces valid Unicode scalar values across all planes (BMP, SMP, SIP, planes 3-16), correctly encoded per RFC 3629. Added that function as a utility. ### Are these changes tested? Yes, by the existing JSON tests. ### Are there any user-facing changes? No, test-only. * GitHub Issue: #48941 Authored-by: Hyukjin Kwon <gurwls223@apache.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent c0d5a59 commit 7dacbd0

File tree

3 files changed

+100
-8
lines changed

3 files changed

+100
-8
lines changed

cpp/src/arrow/json/test_common.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "arrow/json/parser.h"
3434
#include "arrow/json/rapidjson_defs.h"
3535
#include "arrow/testing/gtest_util.h"
36+
#include "arrow/testing/random.h"
3637
#include "arrow/type.h"
3738
#include "arrow/util/checked_cast.h"
3839
#include "arrow/visit_type_inline.h"
@@ -110,20 +111,19 @@ struct GenerateImpl {
110111
return OK(writer.Double(val));
111112
}
112113

113-
Status GenerateAscii(const DataType&) {
114-
auto size = std::poisson_distribution<>{4}(e);
115-
std::uniform_int_distribution<uint16_t> gen_char(32, 126); // FIXME generate UTF8
116-
std::string s(size, '\0');
117-
for (char& ch : s) ch = static_cast<char>(gen_char(e));
118-
return OK(writer.String(s.c_str()));
114+
// Write one random UTF-8 string value to the JSON writer.
// The DataType argument only selects this overload; its value is not used.
Status GenerateUtf8(const DataType&) {
  // Draw order from `e` is significant: code point count first, then the seed.
  std::poisson_distribution<> length_dist{4};
  const auto length = length_dist(e);

  std::uniform_int_distribution<uint32_t> seed_dist;
  const auto utf8_seed = seed_dist(e);

  const std::string text = RandomUtf8String(utf8_seed, length);
  return OK(writer.String(text));
}
120120

121121
template <typename T>
122122
enable_if_base_binary<T, Status> Visit(const T& t) {
123-
return GenerateAscii(t);
123+
return GenerateUtf8(t);
124124
}
125125

126-
Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }
126+
Status Visit(const BinaryViewType& t) { return GenerateUtf8(t); }
127127

128128
template <typename T>
129129
enable_if_list_like<T, Status> Visit(const T& t) {

cpp/src/arrow/testing/random.cc

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1475,4 +1475,81 @@ void rand_month_day_nanos(int64_t N,
14751475
});
14761476
}
14771477

1478+
// Generate a random string of `num_chars` Unicode scalar values, encoded as UTF-8.
//
// Each code point is drawn from one of four equally likely groups (BMP, SMP, SIP,
// planes 3-16), so sampling is not uniform over the whole code space. Surrogates
// (U+D800-U+DFFF), control characters below U+0020 and the BMP noncharacters
// U+FFFE/U+FFFF are never produced.
//
// `seed` seeds the pcg32 generator. A `num_chars` <= 0 yields an empty string.
std::string RandomUtf8String(random::SeedType seed, int num_chars) {
  std::string s;
  if (num_chars <= 0) {
    // A negative count would otherwise wrap to a huge unsigned value in reserve()
    // and throw std::length_error.
    return s;
  }
  // A code point needs at most 4 bytes, and three of the four groups below always
  // encode to 4 bytes, so reserve the worst case. Widen before multiplying to avoid
  // signed integer overflow for large counts.
  s.reserve(static_cast<std::string::size_type>(num_chars) * 4);

  arrow::random::pcg32 gen(seed);

  std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
  std::bernoulli_distribution bmp_range_dist(0.5);
  std::uniform_int_distribution<uint32_t> bmp_lower_dist(0x0020, 0xD7FF);
  std::uniform_int_distribution<uint32_t> bmp_upper_dist(0xE000, 0xFFFD);
  std::uniform_int_distribution<uint32_t> smp_dist(0x10000, 0x1FFFF);
  std::uniform_int_distribution<uint32_t> sip_dist(0x20000, 0x2FFFF);
  std::uniform_int_distribution<uint32_t> high_plane_dist(0x30000, 0x10FFFF);

  for (int i = 0; i < num_chars; ++i) {
    uint32_t codepoint;
    // NOTE: the order of draws from `gen` determines the output for a given seed;
    // keep it stable (group, then BMP sub-range if applicable, then the value).
    const uint32_t plane = plane_dist(gen);

    if (plane == 0) {
      // Basic Multilingual Plane (BMP): U+0000 to U+FFFF
      // Exclude surrogate code points (U+D800 to U+DFFF)
      // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71)
      // Exclude control chars below U+0020 for readability
      // Generate from two ranges with equal probability (overrepresents the smaller
      // upper range):
      // - Lower: U+0020 to U+D7FF (55,264 values, 50% selection probability)
      // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
      if (bmp_range_dist(gen)) {
        // Lower range: U+0020 to U+D7FF (before surrogate range)
        codepoint = bmp_lower_dist(gen);
      } else {
        // Upper range: U+E000 to U+FFFD (after surrogate range)
        // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
        // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included
        // as they are valid Unicode scalar values per the Unicode Standard
        codepoint = bmp_upper_dist(gen);
      }
    } else if (plane == 1) {
      // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
      // https://www.unicode.org/roadmaps/smp/
      codepoint = smp_dist(gen);
    } else if (plane == 2) {
      // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
      // https://www.unicode.org/roadmaps/sip/
      codepoint = sip_dist(gen);
    } else {
      // Planes 3-16: U+30000 to U+10FFFF
      // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes
      // Max valid Unicode codepoint is U+10FFFF per the Standard
      // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
      codepoint = high_plane_dist(gen);
    }

    // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
    // https://www.rfc-editor.org/rfc/rfc3629.html#section-3
    if (codepoint <= 0x7F) {
      // 1-byte sequence: 0xxxxxxx
      s.push_back(static_cast<char>(codepoint));
    } else if (codepoint <= 0x7FF) {
      // 2-byte sequence: 110xxxxx 10xxxxxx
      s.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
    } else if (codepoint <= 0xFFFF) {
      // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
      s.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
      s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
    } else {
      // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      s.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
      s.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
      s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
    }
  }
  return s;
}
1554+
14781555
} // namespace arrow

cpp/src/arrow/testing/random.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -729,6 +729,21 @@ ARROW_TESTING_EXPORT
729729
void rand_month_day_nanos(int64_t N,
730730
std::vector<MonthDayNanoIntervalType::MonthDayNanos>* out);
731731

732+
/// \brief Generate a random UTF-8 encoded string
733+
///
734+
/// Generates a string with valid UTF-8 encoding from random Unicode scalar values.
735+
/// The generated string contains num_chars code points. Each code point is drawn from
736+
/// one of four equally likely groups: the Basic Multilingual Plane (BMP), the
737+
/// Supplementary Multilingual Plane (SMP), the Supplementary Ideographic Plane (SIP),
738+
/// or planes 3-16 (up to U+10FFFF), so sampling is not uniform over all code points.
739+
/// Surrogate code points (U+D800-U+DFFF) are excluded as they are not valid
/// Unicode scalar values. Control characters below U+0020 and the noncharacters
/// U+FFFE and U+FFFF are not generated either.
740+
///
741+
/// \param[in] seed Random seed for reproducibility
742+
/// \param[in] num_chars Number of Unicode code points to generate
743+
/// \return a generated UTF-8 encoded string
744+
ARROW_TESTING_EXPORT
745+
std::string RandomUtf8String(random::SeedType seed, int num_chars);
746+
732747
template <typename T, typename U>
733748
void randint(int64_t N, T lower, T upper, std::vector<U>* out) {
734749
const int random_seed = 0;

0 commit comments

Comments
 (0)