@@ -1475,15 +1475,21 @@ void rand_month_day_nanos(int64_t N,
14751475 });
14761476}
14771477
1478- std::string RandomUtf8String (int num_chars) {
1479- std::random_device rd;
1480- std::default_random_engine gen (rd ());
1478+ std::string RandomUtf8String (random::SeedType seed, int num_chars) {
1479+ arrow::random::pcg32 gen (seed);
14811480 std::string s;
14821481 s.reserve (num_chars * 3 ); // Reserve for average 3 bytes per codepoint
14831482
1483+ std::uniform_int_distribution<uint32_t > plane_dist (0 , 3 );
1484+ std::bernoulli_distribution bmp_range_dist (0.5 );
1485+ std::uniform_int_distribution<uint32_t > bmp_lower_dist (0x0020 , 0xD7FF );
1486+ std::uniform_int_distribution<uint32_t > bmp_upper_dist (0xE000 , 0xFFFD );
1487+ std::uniform_int_distribution<uint32_t > smp_dist (0x10000 , 0x1FFFF );
1488+ std::uniform_int_distribution<uint32_t > sip_dist (0x20000 , 0x2FFFF );
1489+ std::uniform_int_distribution<uint32_t > high_plane_dist (0x30000 , 0x10FFFF );
1490+
14841491 for (int i = 0 ; i < num_chars; ++i) {
14851492 uint32_t codepoint;
1486- std::uniform_int_distribution<uint32_t > plane_dist (0 , 3 );
14871493 uint32_t plane = plane_dist (gen);
14881494
14891495 if (plane == 0 ) {
@@ -1495,30 +1501,30 @@ std::string RandomUtf8String(int num_chars) {
14951501 // upper range):
14961502 // - Lower: U+0020 to U+D7FF (55,264 values, 50% selection probability)
14971503 // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
1498- if (std::bernoulli_distribution ( 0.5 ) (gen)) {
1504+ if (bmp_range_dist (gen)) {
14991505 // Lower range: U+0020 to U+D7FF (before surrogate range)
1500- codepoint = std::uniform_int_distribution< uint32_t >( 0x0020 , 0xD7FF ) (gen);
1506+ codepoint = bmp_lower_dist (gen);
15011507 } else {
15021508 // Upper range: U+E000 to U+FFFD (after surrogate range)
15031509 // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
15041510 // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included
15051511 // as they are valid Unicode scalar values per the Unicode Standard
1506- codepoint = std::uniform_int_distribution< uint32_t >( 0xE000 , 0xFFFD ) (gen);
1512+ codepoint = bmp_upper_dist (gen);
15071513 }
15081514 } else if (plane == 1 ) {
15091515 // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
15101516 // https://www.unicode.org/roadmaps/smp/
1511- codepoint = std::uniform_int_distribution< uint32_t >( 0x10000 , 0x1FFFF ) (gen);
1517+ codepoint = smp_dist (gen);
15121518 } else if (plane == 2 ) {
15131519 // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
15141520 // https://www.unicode.org/roadmaps/sip/
1515- codepoint = std::uniform_int_distribution< uint32_t >( 0x20000 , 0x2FFFF ) (gen);
1521+ codepoint = sip_dist (gen);
15161522 } else {
15171523 // Planes 3–16: U+30000–U+10FFFF
15181524 // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF
15191525 // Max valid Unicode codepoint is U+10FFFF per the Standard
15201526 // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
1521- codepoint = std::uniform_int_distribution< uint32_t >( 0x30000 , 0x10FFFF ) (gen);
1527+ codepoint = high_plane_dist (gen);
15221528 }
15231529
15241530 // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
0 commit comments