From 64f91009f290b5495f5de8475f23db697f610bc7 Mon Sep 17 00:00:00 2001
From: Hyukjin Kwon <gurwls223@apache.org>
Date: Thu, 22 Jan 2026 18:19:30 +0900
Subject: [PATCH 1/3] [C++] Generate proper UTF-8 strings in JSON test
 utilities

---
 cpp/src/arrow/json/test_common.h | 79 +++++++++++++++++++++++++++++---
 1 file changed, 72 insertions(+), 7 deletions(-)
diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h
index 423a0123c05..dbaaa9f82ce 100644
--- a/cpp/src/arrow/json/test_common.h
+++ b/cpp/src/arrow/json/test_common.h
@@ -110,20 +110,85 @@ struct GenerateImpl {
     return OK(writer.Double(val));
   }
 
-  Status GenerateAscii(const DataType&) {
-    auto size = std::poisson_distribution<>{4}(e);
-    std::uniform_int_distribution<uint16_t> gen_char(32, 126);  // FIXME generate UTF8
-    std::string s(size, '\0');
-    for (char& ch : s) ch = static_cast<char>(gen_char(e));
+  Status GenerateUtf8(const DataType&) {
+    // Generate random UTF-8 encoded strings from valid Unicode scalar values.
+    auto num_codepoints = std::poisson_distribution<>{4}(e);
+    std::string s;
+    s.reserve(num_codepoints * 3);
+
+    for (int i = 0; i < num_codepoints; ++i) {
+      uint32_t codepoint;
+      std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
+      uint32_t plane = plane_dist(e);
+
+      if (plane == 0) {
+        // Basic Multilingual Plane (BMP): U+0000 to U+FFFF
+        // Exclude surrogate code points (U+D800 to U+DFFF)
+        // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71)
+        // Exclude control chars below U+0020 for readability
+        // Generate from two ranges with equal probability (overrepresents the smaller
+        // upper range):
+        // - Lower: U+0020 to U+D7FF (55,776 values, 50% selection probability)
+        // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
+        if (std::bernoulli_distribution(0.5)(e)) {
+          // Lower range: U+0020 to U+D7FF (before surrogate range)
+          codepoint = std::uniform_int_distribution<uint32_t>(0x0020, 0xD7FF)(e);
+        } else {
+          // Upper range: U+E000 to U+FFFD (after surrogate range)
+          // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
+          // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included
+          // as they are valid Unicode scalar values per the Unicode Standard
+          codepoint = std::uniform_int_distribution<uint32_t>(0xE000, 0xFFFD)(e);
+        }
+      } else if (plane == 1) {
+        // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
+        // https://www.unicode.org/roadmaps/smp/
+        codepoint = std::uniform_int_distribution<uint32_t>(0x10000, 0x1FFFF)(e);
+      } else if (plane == 2) {
+        // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
+        // https://www.unicode.org/roadmaps/sip/
+        codepoint = std::uniform_int_distribution<uint32_t>(0x20000, 0x2FFFF)(e);
+      } else {
+        // Planes 3–16: U+30000–U+10FFFF
+        // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF
+        // Max valid Unicode codepoint is U+10FFFF per the Standard
+        // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
+        codepoint = std::uniform_int_distribution<uint32_t>(0x30000, 0x10FFFF)(e);
+      }
+
+      // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
+      // https://www.rfc-editor.org/rfc/rfc3629.html#section-3
+      if (codepoint <= 0x7F) {
+        // 1-byte sequence: 0xxxxxxx
+        s.push_back(static_cast<char>(codepoint));
+      } else if (codepoint <= 0x7FF) {
+        // 2-byte sequence: 110xxxxx 10xxxxxx
+        s.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
+        s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+      } else if (codepoint <= 0xFFFF) {
+        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+        s.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
+        s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+        s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+      } else {
+        // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        s.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
+        s.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
+        s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+        s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+      }
+    }
+    // Using c_str() is safe here because generation excludes U+0000 (no embedded nulls).
+    // U+0000 can only exist in plane 0 (BMP), and BMP generation starts at U+0020.
     return OK(writer.String(s.c_str()));
   }
 
   template <typename T>
   enable_if_base_binary<T, Status> Visit(const T& t) {
-    return GenerateAscii(t);
+    return GenerateUtf8(t);
   }
 
-  Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }
+  Status Visit(const BinaryViewType& t) { return GenerateUtf8(t); }
 
   template <typename T>
   enable_if_list_like<T, Status> Visit(const T& t) {

From e34d25f94632cc06e53c09ebb44db2746d1a60c8 Mon Sep 17 00:00:00 2001
From: Hyukjin Kwon <gurwls223@apache.org>
Date: Wed, 28 Jan 2026 16:49:07 +0900
Subject: [PATCH 2/3] Address review comment: move UTF-8 generation into RandomUtf8String

---
 cpp/src/arrow/json/test_common.h | 72 ++------------------------------
 cpp/src/arrow/testing/random.cc  | 71 +++++++++++++++++++++++++++++++
 cpp/src/arrow/testing/random.h   | 14 +++++++
 3 files changed, 88 insertions(+), 69 deletions(-)

diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h
index dbaaa9f82ce..c5cb5ec3889 100644
--- a/cpp/src/arrow/json/test_common.h
+++ b/cpp/src/arrow/json/test_common.h
@@ -33,6 +33,7 @@
 #include "arrow/json/parser.h"
 #include "arrow/json/rapidjson_defs.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
 #include "arrow/type.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/visit_type_inline.h"
@@ -111,76 +112,9 @@ struct GenerateImpl {
   }
 
   Status GenerateUtf8(const DataType&) {
-    // Generate random UTF-8 encoded strings from valid Unicode scalar values.
     auto num_codepoints = std::poisson_distribution<>{4}(e);
-    std::string s;
-    s.reserve(num_codepoints * 3);
-
-    for (int i = 0; i < num_codepoints; ++i) {
-      uint32_t codepoint;
-      std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
-      uint32_t plane = plane_dist(e);
-
-      if (plane == 0) {
-        // Basic Multilingual Plane (BMP): U+0000 to U+FFFF
-        // Exclude surrogate code points (U+D800 to U+DFFF)
-        // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71)
-        // Exclude control chars below U+0020 for readability
-        // Generate from two ranges with equal probability (overrepresents the smaller
-        // upper range):
-        // - Lower: U+0020 to U+D7FF (55,776 values, 50% selection probability)
-        // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
-        if (std::bernoulli_distribution(0.5)(e)) {
-          // Lower range: U+0020 to U+D7FF (before surrogate range)
-          codepoint = std::uniform_int_distribution<uint32_t>(0x0020, 0xD7FF)(e);
-        } else {
-          // Upper range: U+E000 to U+FFFD (after surrogate range)
-          // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
-          // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included
-          // as they are valid Unicode scalar values per the Unicode Standard
-          codepoint = std::uniform_int_distribution<uint32_t>(0xE000, 0xFFFD)(e);
-        }
-      } else if (plane == 1) {
-        // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
-        // https://www.unicode.org/roadmaps/smp/
-        codepoint = std::uniform_int_distribution<uint32_t>(0x10000, 0x1FFFF)(e);
-      } else if (plane == 2) {
-        // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
-        // https://www.unicode.org/roadmaps/sip/
-        codepoint = std::uniform_int_distribution<uint32_t>(0x20000, 0x2FFFF)(e);
-      } else {
-        // Planes 3–16: U+30000–U+10FFFF
-        // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF
-        // Max valid Unicode codepoint is U+10FFFF per the Standard
-        // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
-        codepoint = std::uniform_int_distribution<uint32_t>(0x30000, 0x10FFFF)(e);
-      }
-
-      // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
-      // https://www.rfc-editor.org/rfc/rfc3629.html#section-3
-      if (codepoint <= 0x7F) {
-        // 1-byte sequence: 0xxxxxxx
-        s.push_back(static_cast<char>(codepoint));
-      } else if (codepoint <= 0x7FF) {
-        // 2-byte sequence: 110xxxxx 10xxxxxx
-        s.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
-        s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
-      } else if (codepoint <= 0xFFFF) {
-        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
-        s.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
-        s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
-        s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
-      } else {
-        // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-        s.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
-        s.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
-        s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
-        s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
-      }
-    }
-    // Using c_str() is safe here because generation excludes U+0000 (no embedded nulls).
-    // U+0000 can only exist in plane 0 (BMP), and BMP generation starts at U+0020.
-    return OK(writer.String(s.c_str()));
+    std::string s = RandomUtf8String(num_codepoints);
+    return OK(writer.String(s));
   }
 
   template <typename T>
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index c50387e4909..1d7e84790ac 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -1475,4 +1475,75 @@ void rand_month_day_nanos(int64_t N,
   });
 }
 
+std::string RandomUtf8String(int num_chars) {
+  std::random_device rd;
+  std::default_random_engine gen(rd());
+  std::string s;
+  s.reserve(num_chars * 3);  // Reserve for average 3 bytes per codepoint
+
+  for (int i = 0; i < num_chars; ++i) {
+    uint32_t codepoint;
+    std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
+    uint32_t plane = plane_dist(gen);
+
+    if (plane == 0) {
+      // Basic Multilingual Plane (BMP): U+0000 to U+FFFF
+      // Exclude surrogate code points (U+D800 to U+DFFF)
+      // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71)
+      // Exclude control chars below U+0020 for readability
+      // Generate from two ranges with equal probability (overrepresents the smaller
+      // upper range):
+      // - Lower: U+0020 to U+D7FF (55,776 values, 50% selection probability)
+      // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
+      if (std::bernoulli_distribution(0.5)(gen)) {
+        // Lower range: U+0020 to U+D7FF (before surrogate range)
+        codepoint = std::uniform_int_distribution<uint32_t>(0x0020, 0xD7FF)(gen);
+      } else {
+        // Upper range: U+E000 to U+FFFD (after surrogate range)
+        // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
+        // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included
+        // as they are valid Unicode scalar values per the Unicode Standard
+        codepoint = std::uniform_int_distribution<uint32_t>(0xE000, 0xFFFD)(gen);
+      }
+    } else if (plane == 1) {
+      // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
+      // https://www.unicode.org/roadmaps/smp/
+      codepoint = std::uniform_int_distribution<uint32_t>(0x10000, 0x1FFFF)(gen);
+    } else if (plane == 2) {
+      // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
+      // https://www.unicode.org/roadmaps/sip/
+      codepoint = std::uniform_int_distribution<uint32_t>(0x20000, 0x2FFFF)(gen);
+    } else {
+      // Planes 3–16: U+30000–U+10FFFF
+      // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF
+      // Max valid Unicode codepoint is U+10FFFF per the Standard
+      // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
+      codepoint = std::uniform_int_distribution<uint32_t>(0x30000, 0x10FFFF)(gen);
+    }
+
+    // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
+    // https://www.rfc-editor.org/rfc/rfc3629.html#section-3
+    if (codepoint <= 0x7F) {
+      // 1-byte sequence: 0xxxxxxx
+      s.push_back(static_cast<char>(codepoint));
+    } else if (codepoint <= 0x7FF) {
+      // 2-byte sequence: 110xxxxx 10xxxxxx
+      s.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
+      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+    } else if (codepoint <= 0xFFFF) {
+      // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+      s.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
+      s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+    } else {
+      // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      s.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
+      s.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
+      s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+    }
+  }
+  return s;
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index d9122915a09..854e5443bfa 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -729,6 +729,20 @@ ARROW_TESTING_EXPORT
 void rand_month_day_nanos(int64_t N,
                           std::vector<MonthDayNanoIntervalType::MonthDayNanos>* out);
 
+/// \brief Generate a random UTF-8 encoded string
+///
+/// Generates a string with valid UTF-8 encoding from random Unicode scalar values.
+/// Each of the num_chars code points is drawn from one of four equally likely groups:
+/// the BMP (U+0020 and up, minus U+FFFE/U+FFFF), the SMP, the SIP, or planes 3-16
+/// (up to U+10FFFF). Code points are therefore not uniform over all of Unicode.
+/// Surrogate code points (U+D800-U+DFFF) are excluded as they are not valid
+/// Unicode scalar values.
+///
+/// \param[in] num_chars Number of Unicode code points to generate
+/// \return a generated UTF-8 encoded string
+ARROW_TESTING_EXPORT
+std::string RandomUtf8String(int num_chars);
+
 template <typename T, typename U>
 void randint(int64_t N, T lower, T upper, std::vector<U>* out) {
   const int random_seed = 0;

From b681f2b59c9a5f3827b6ed73e8027262b977fbe0 Mon Sep 17 00:00:00 2001
From: Hyukjin Kwon <gurwls223@apache.org>
Date: Thu, 29 Jan 2026 15:26:27 +0900
Subject: [PATCH 3/3] Address a review comment: seed RandomUtf8String, hoist distributions

---
 cpp/src/arrow/json/test_common.h |  3 ++-
 cpp/src/arrow/testing/random.cc  | 26 ++++++++++++++++----------
 cpp/src/arrow/testing/random.h   |  3 ++-
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h
index c5cb5ec3889..ab2ce9cdc74 100644
--- a/cpp/src/arrow/json/test_common.h
+++ b/cpp/src/arrow/json/test_common.h
@@ -113,7 +113,8 @@ struct GenerateImpl {
 
   Status GenerateUtf8(const DataType&) {
     auto num_codepoints = std::poisson_distribution<>{4}(e);
-    std::string s = RandomUtf8String(num_codepoints);
+    auto seed = std::uniform_int_distribution<uint32_t>{}(e);
+    std::string s = RandomUtf8String(seed, num_codepoints);
     return OK(writer.String(s));
   }
 
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index 1d7e84790ac..f73dbd5bbf7 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -1475,15 +1475,21 @@ void rand_month_day_nanos(int64_t N,
   });
 }
 
-std::string RandomUtf8String(int num_chars) {
-  std::random_device rd;
-  std::default_random_engine gen(rd());
+std::string RandomUtf8String(random::SeedType seed, int num_chars) {
+  arrow::random::pcg32 gen(seed);
   std::string s;
   s.reserve(num_chars * 3);  // Reserve for average 3 bytes per codepoint
 
+  std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
+  std::bernoulli_distribution bmp_range_dist(0.5);
+  std::uniform_int_distribution<uint32_t> bmp_lower_dist(0x0020, 0xD7FF);
+  std::uniform_int_distribution<uint32_t> bmp_upper_dist(0xE000, 0xFFFD);
+  std::uniform_int_distribution<uint32_t> smp_dist(0x10000, 0x1FFFF);
+  std::uniform_int_distribution<uint32_t> sip_dist(0x20000, 0x2FFFF);
+  std::uniform_int_distribution<uint32_t> high_plane_dist(0x30000, 0x10FFFF);
+
   for (int i = 0; i < num_chars; ++i) {
     uint32_t codepoint;
-    std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
     uint32_t plane = plane_dist(gen);
 
     if (plane == 0) {
@@ -1495,30 +1501,30 @@ std::string RandomUtf8String(int num_chars) {
       // upper range):
       // - Lower: U+0020 to U+D7FF (55,776 values, 50% selection probability)
       // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
-      if (std::bernoulli_distribution(0.5)(gen)) {
+      if (bmp_range_dist(gen)) {
         // Lower range: U+0020 to U+D7FF (before surrogate range)
-        codepoint = std::uniform_int_distribution<uint32_t>(0x0020, 0xD7FF)(gen);
+        codepoint = bmp_lower_dist(gen);
       } else {
         // Upper range: U+E000 to U+FFFD (after surrogate range)
         // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
         // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included
         // as they are valid Unicode scalar values per the Unicode Standard
-        codepoint = std::uniform_int_distribution<uint32_t>(0xE000, 0xFFFD)(gen);
+        codepoint = bmp_upper_dist(gen);
       }
     } else if (plane == 1) {
       // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
       // https://www.unicode.org/roadmaps/smp/
-      codepoint = std::uniform_int_distribution<uint32_t>(0x10000, 0x1FFFF)(gen);
+      codepoint = smp_dist(gen);
     } else if (plane == 2) {
       // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
       // https://www.unicode.org/roadmaps/sip/
-      codepoint = std::uniform_int_distribution<uint32_t>(0x20000, 0x2FFFF)(gen);
+      codepoint = sip_dist(gen);
     } else {
       // Planes 3–16: U+30000–U+10FFFF
       // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF
       // Max valid Unicode codepoint is U+10FFFF per the Standard
       // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
-      codepoint = std::uniform_int_distribution<uint32_t>(0x30000, 0x10FFFF)(gen);
+      codepoint = high_plane_dist(gen);
     }
 
     // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index 854e5443bfa..f820e643986 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -738,10 +738,11 @@ void rand_month_day_nanos(int64_t N,
 /// Surrogate code points (U+D800-U+DFFF) are excluded as they are not valid
 /// Unicode scalar values.
 ///
+/// \param[in] seed Random seed for reproducibility
 /// \param[in] num_chars Number of Unicode code points to generate
 /// \return a generated UTF-8 encoded string
 ARROW_TESTING_EXPORT
-std::string RandomUtf8String(int num_chars);
+std::string RandomUtf8String(random::SeedType seed, int num_chars);
 
 template <typename T, typename U>
 void randint(int64_t N, T lower, T upper, std::vector<U>* out) {