GH-48941: [C++] Generate proper UTF-8 strings in JSON test utilities (#48943)

HyukjinKwon · web-flow · commit 7dacbd048473 · 2026-02-03T19:13:30.000+01:00
### Rationale for this change The JSON test utility `GenerateAscii` only generated ASCII characters. The tests should also cover proper UTF-8 and Unicode handling. ### What changes are included in this PR? Replaced ASCII-only generation with proper UTF-8 string generation that produces valid Unicode scalar values across all planes (BMP, SMP, SIP, planes 3-16), correctly encoded per RFC 3629. Added that function as a utility. ### Are these changes tested? There are existing tests for JSON. ### Are there any user-facing changes? No, test-only. * GitHub Issue: #48941 Authored-by: Hyukjin Kwon <gurwls223@apache.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h
@@ -33,6 +33,7 @@
 #include "arrow/json/parser.h"
 #include "arrow/json/rapidjson_defs.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
 #include "arrow/type.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/visit_type_inline.h"
@@ -110,20 +111,19 @@ struct GenerateImpl {
     return OK(writer.Double(val));
   }
 
-  Status GenerateAscii(const DataType&) {
-    auto size = std::poisson_distribution<>{4}(e);
-    std::uniform_int_distribution<uint16_t> gen_char(32, 126);  // FIXME generate UTF8
-    std::string s(size, '\0');
-    for (char& ch : s) ch = static_cast<char>(gen_char(e));
-    return OK(writer.String(s.c_str()));
+  Status GenerateUtf8(const DataType&) {
+    auto num_codepoints = std::poisson_distribution<>{4}(e);  // mean of 4 code points
+    auto seed = std::uniform_int_distribution<uint32_t>{}(e);  // per-string seed
+    std::string s = RandomUtf8String(seed, num_codepoints);
+    return OK(writer.String(s));
   }
 
   template <typename T>
   enable_if_base_binary<T, Status> Visit(const T& t) {
-    return GenerateAscii(t);
+    return GenerateUtf8(t);
   }
 
-  Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }
+  Status Visit(const BinaryViewType& t) { return GenerateUtf8(t); }
 
   template <typename T>
   enable_if_list_like<T, Status> Visit(const T& t) {
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
@@ -1475,4 +1475,81 @@ void rand_month_day_nanos(int64_t N,
   });
 }
 
+std::string RandomUtf8String(random::SeedType seed, int num_chars) {
+  arrow::random::pcg32 gen(seed);
+  std::string s;
+  s.reserve(num_chars * 3);  // Capacity hint only; assumes num_chars >= 0
+
+  std::uniform_int_distribution<uint32_t> plane_dist(0, 3);  // 4 equally likely groups
+  std::bernoulli_distribution bmp_range_dist(0.5);  // true selects the lower BMP range
+  std::uniform_int_distribution<uint32_t> bmp_lower_dist(0x0020, 0xD7FF);
+  std::uniform_int_distribution<uint32_t> bmp_upper_dist(0xE000, 0xFFFD);
+  std::uniform_int_distribution<uint32_t> smp_dist(0x10000, 0x1FFFF);
+  std::uniform_int_distribution<uint32_t> sip_dist(0x20000, 0x2FFFF);
+  std::uniform_int_distribution<uint32_t> high_plane_dist(0x30000, 0x10FFFF);
+
+  for (int i = 0; i < num_chars; ++i) {
+    uint32_t codepoint;
+    uint32_t plane = plane_dist(gen);
+
+    if (plane == 0) {
+      // Basic Multilingual Plane (BMP): U+0000 to U+FFFF
+      // Exclude surrogate code points (U+D800 to U+DFFF)
+      // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71)
+      // Exclude control chars below U+0020 for readability
+      // Generate from two ranges with equal probability (overrepresents the smaller
+      // upper range):
+      // - Lower: U+0020 to U+D7FF (55,264 values, 50% selection probability)
+      // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
+      if (bmp_range_dist(gen)) {
+        // Lower range: U+0020 to U+D7FF (before surrogate range)
+        codepoint = bmp_lower_dist(gen);
+      } else {
+        // Upper range: U+E000 to U+FFFD (after surrogate range)
+        // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
+        // Other noncharacters (U+FDD0-U+FDEF; U+nFFFE/U+nFFFF in planes 1-16) are
+        // still generated, as they are valid Unicode scalar values per the Standard
+        codepoint = bmp_upper_dist(gen);
+      }
+    } else if (plane == 1) {
+      // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
+      // https://www.unicode.org/roadmaps/smp/
+      codepoint = smp_dist(gen);
+    } else if (plane == 2) {
+      // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
+      // https://www.unicode.org/roadmaps/sip/
+      codepoint = sip_dist(gen);
+    } else {
+      // Planes 3-16: U+30000 to U+10FFFF
+      // Covers TIP, SSP, PUA-A, PUA-B, and the unassigned planes in between
+      // Max valid Unicode codepoint is U+10FFFF per the Standard
+      // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
+      codepoint = high_plane_dist(gen);
+    }
+
+    // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
+    // https://www.rfc-editor.org/rfc/rfc3629.html#section-3
+    if (codepoint <= 0x7F) {
+      // 1-byte sequence: 0xxxxxxx
+      s.push_back(static_cast<char>(codepoint));
+    } else if (codepoint <= 0x7FF) {
+      // 2-byte sequence: 110xxxxx 10xxxxxx
+      s.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
+      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+    } else if (codepoint <= 0xFFFF) {
+      // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+      s.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
+      s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+    } else {
+      // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      s.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
+      s.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
+      s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
+    }
+  }
+  return s;
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
@@ -729,6 +729,21 @@ ARROW_TESTING_EXPORT
 void rand_month_day_nanos(int64_t N,
                           std::vector<MonthDayNanoIntervalType::MonthDayNanos>* out);
 
+/// \brief Generate a random UTF-8 encoded string
+///
+/// Generates a string with valid UTF-8 encoding from random Unicode scalar values.
+/// Each of the num_chars code points is drawn from one of four equally likely
+/// groups: the Basic Multilingual Plane (BMP), the Supplementary Multilingual Plane
+/// (SMP), the Supplementary Ideographic Plane (SIP), or planes 3-16 (up to U+10FFFF).
+/// Surrogates (U+D800-U+DFFF), which are not Unicode scalar values, are never
+/// produced, nor are code points below U+0020, U+FFFE or U+FFFF.
+///
+/// \param[in] seed Random seed for reproducibility
+/// \param[in] num_chars Number of Unicode code points to generate
+/// \return a generated UTF-8 encoded string
+ARROW_TESTING_EXPORT
+std::string RandomUtf8String(random::SeedType seed, int num_chars);
+
 template <typename T, typename U>
 void randint(int64_t N, T lower, T upper, std::vector<U>* out) {
   const int random_seed = 0;