Skip to content

Commit b681f2b

Browse files
committed
Address a review comment
1 parent e34d25f commit b681f2b

File tree

3 files changed

+20
-12
lines changed

3 files changed

+20
-12
lines changed

cpp/src/arrow/json/test_common.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,8 @@ struct GenerateImpl {
113113

114114
Status GenerateUtf8(const DataType&) {
115115
auto num_codepoints = std::poisson_distribution<>{4}(e);
116-
std::string s = RandomUtf8String(num_codepoints);
116+
auto seed = std::uniform_int_distribution<uint32_t>{}(e);
117+
std::string s = RandomUtf8String(seed, num_codepoints);
117118
return OK(writer.String(s));
118119
}
119120

cpp/src/arrow/testing/random.cc

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1475,15 +1475,21 @@ void rand_month_day_nanos(int64_t N,
14751475
});
14761476
}
14771477

1478-
std::string RandomUtf8String(int num_chars) {
1479-
std::random_device rd;
1480-
std::default_random_engine gen(rd());
1478+
std::string RandomUtf8String(random::SeedType seed, int num_chars) {
1479+
arrow::random::pcg32 gen(seed);
14811480
std::string s;
14821481
s.reserve(num_chars * 3); // Reserve for average 3 bytes per codepoint
14831482

1483+
std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
1484+
std::bernoulli_distribution bmp_range_dist(0.5);
1485+
std::uniform_int_distribution<uint32_t> bmp_lower_dist(0x0020, 0xD7FF);
1486+
std::uniform_int_distribution<uint32_t> bmp_upper_dist(0xE000, 0xFFFD);
1487+
std::uniform_int_distribution<uint32_t> smp_dist(0x10000, 0x1FFFF);
1488+
std::uniform_int_distribution<uint32_t> sip_dist(0x20000, 0x2FFFF);
1489+
std::uniform_int_distribution<uint32_t> high_plane_dist(0x30000, 0x10FFFF);
1490+
14841491
for (int i = 0; i < num_chars; ++i) {
14851492
uint32_t codepoint;
1486-
std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
14871493
uint32_t plane = plane_dist(gen);
14881494

14891495
if (plane == 0) {
@@ -1495,30 +1501,30 @@ std::string RandomUtf8String(int num_chars) {
14951501
// upper range):
14961502
// - Lower: U+0020 to U+D7FF (55,264 values, 50% selection probability)
14971503
// - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
1498-
if (std::bernoulli_distribution(0.5)(gen)) {
1504+
if (bmp_range_dist(gen)) {
14991505
// Lower range: U+0020 to U+D7FF (before surrogate range)
1500-
codepoint = std::uniform_int_distribution<uint32_t>(0x0020, 0xD7FF)(gen);
1506+
codepoint = bmp_lower_dist(gen);
15011507
} else {
15021508
// Upper range: U+E000 to U+FFFD (after surrogate range)
15031509
// Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
15041510
// Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included
15051511
// as they are valid Unicode scalar values per the Unicode Standard
1506-
codepoint = std::uniform_int_distribution<uint32_t>(0xE000, 0xFFFD)(gen);
1512+
codepoint = bmp_upper_dist(gen);
15071513
}
15081514
} else if (plane == 1) {
15091515
// Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
15101516
// https://www.unicode.org/roadmaps/smp/
1511-
codepoint = std::uniform_int_distribution<uint32_t>(0x10000, 0x1FFFF)(gen);
1517+
codepoint = smp_dist(gen);
15121518
} else if (plane == 2) {
15131519
// Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
15141520
// https://www.unicode.org/roadmaps/sip/
1515-
codepoint = std::uniform_int_distribution<uint32_t>(0x20000, 0x2FFFF)(gen);
1521+
codepoint = sip_dist(gen);
15161522
} else {
15171523
// Planes 3–16: U+30000–U+10FFFF
15181524
// Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF
15191525
// Max valid Unicode codepoint is U+10FFFF per the Standard
15201526
// https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
1521-
codepoint = std::uniform_int_distribution<uint32_t>(0x30000, 0x10FFFF)(gen);
1527+
codepoint = high_plane_dist(gen);
15221528
}
15231529

15241530
// Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)

cpp/src/arrow/testing/random.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -738,10 +738,11 @@ void rand_month_day_nanos(int64_t N,
738738
/// Surrogate code points (U+D800-U+DFFF) are excluded as they are not valid
739739
/// Unicode scalar values.
740740
///
741+
/// \param[in] seed Random seed for reproducibility
741742
/// \param[in] num_chars Number of Unicode code points to generate
742743
/// \return a generated UTF-8 encoded string
743744
ARROW_TESTING_EXPORT
744-
std::string RandomUtf8String(int num_chars);
745+
std::string RandomUtf8String(random::SeedType seed, int num_chars);
745746

746747
template <typename T, typename U>
747748
void randint(int64_t N, T lower, T upper, std::vector<U>* out) {

0 commit comments

Comments
 (0)