Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions cpp/src/arrow/json/test_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "arrow/json/parser.h"
#include "arrow/json/rapidjson_defs.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/visit_type_inline.h"
Expand Down Expand Up @@ -110,20 +111,19 @@ struct GenerateImpl {
return OK(writer.Double(val));
}

Status GenerateAscii(const DataType&) {
auto size = std::poisson_distribution<>{4}(e);
std::uniform_int_distribution<uint16_t> gen_char(32, 126); // FIXME generate UTF8
std::string s(size, '\0');
for (char& ch : s) ch = static_cast<char>(gen_char(e));
return OK(writer.String(s.c_str()));
// Emit one random UTF-8 string value through `writer`.
// The code point count is Poisson-distributed (mean 4) and the string content
// is derived from a fresh 32-bit seed; both are drawn from `e`, in that order.
Status GenerateUtf8(const DataType&) {
  std::poisson_distribution<> length_dist{4};
  std::uniform_int_distribution<uint32_t> seed_dist{};

  const auto codepoint_count = length_dist(e);
  const auto string_seed = seed_dist(e);

  const std::string utf8 = RandomUtf8String(string_seed, codepoint_count);
  return OK(writer.String(utf8));
}

template <typename T>
enable_if_base_binary<T, Status> Visit(const T& t) {
return GenerateAscii(t);
return GenerateUtf8(t);
}

Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }
Status Visit(const BinaryViewType& t) { return GenerateUtf8(t); }

template <typename T>
enable_if_list_like<T, Status> Visit(const T& t) {
Expand Down
77 changes: 77 additions & 0 deletions cpp/src/arrow/testing/random.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1475,4 +1475,81 @@ void rand_month_day_nanos(int64_t N,
});
}

// Build a random, valid UTF-8 string made of `num_chars` Unicode scalar values.
//
// Each code point is produced by first choosing one of four groups with equal
// probability (BMP, SMP, SIP, planes 3-16) and then sampling uniformly inside
// the chosen group. The result is therefore deliberately biased towards
// multi-byte sequences instead of being uniform over all scalar values.
//
// `seed` makes the output reproducible. A non-positive `num_chars` yields an
// empty string.
std::string RandomUtf8String(random::SeedType seed, int num_chars) {
  std::string s;
  if (num_chars <= 0) {
    // A negative count would otherwise be converted to a huge unsigned value in
    // reserve() below and throw std::length_error.
    return s;
  }

  arrow::random::pcg32 gen(seed);
  // Reserve for an average of 3 bytes per code point. Widen before multiplying
  // so the product cannot overflow `int`.
  s.reserve(static_cast<std::string::size_type>(num_chars) * 3);

  std::uniform_int_distribution<uint32_t> plane_dist(0, 3);
  std::bernoulli_distribution bmp_range_dist(0.5);
  std::uniform_int_distribution<uint32_t> bmp_lower_dist(0x0020, 0xD7FF);
  std::uniform_int_distribution<uint32_t> bmp_upper_dist(0xE000, 0xFFFD);
  std::uniform_int_distribution<uint32_t> smp_dist(0x10000, 0x1FFFF);
  std::uniform_int_distribution<uint32_t> sip_dist(0x20000, 0x2FFFF);
  std::uniform_int_distribution<uint32_t> high_plane_dist(0x30000, 0x10FFFF);

  for (int i = 0; i < num_chars; ++i) {
    uint32_t codepoint;
    uint32_t plane = plane_dist(gen);

    if (plane == 0) {
      // Basic Multilingual Plane (BMP): U+0000 to U+FFFF
      // Exclude surrogate code points (U+D800 to U+DFFF)
      // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71)
      // Exclude control chars below U+0020 for readability
      // Generate from two ranges with equal probability (overrepresents the smaller
      // upper range):
      // - Lower: U+0020 to U+D7FF (55,264 values, 50% selection probability)
      // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability)
      if (bmp_range_dist(gen)) {
        // Lower range: U+0020 to U+D7FF (before surrogate range)
        codepoint = bmp_lower_dist(gen);
      } else {
        // Upper range: U+E000 to U+FFFD (after surrogate range)
        // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF
        // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included
        // as they are valid Unicode scalar values per the Unicode Standard
        codepoint = bmp_upper_dist(gen);
      }
    } else if (plane == 1) {
      // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF
      // https://www.unicode.org/roadmaps/smp/
      codepoint = smp_dist(gen);
    } else if (plane == 2) {
      // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF
      // https://www.unicode.org/roadmaps/sip/
      codepoint = sip_dist(gen);
    } else {
      // Planes 3-16: U+30000 to U+10FFFF
      // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes
      // Max valid Unicode codepoint is U+10FFFF per the Standard
      // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9)
      codepoint = high_plane_dist(gen);
    }

    // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition)
    // https://www.rfc-editor.org/rfc/rfc3629.html#section-3
    if (codepoint <= 0x7F) {
      // 1-byte sequence: 0xxxxxxx
      s.push_back(static_cast<char>(codepoint));
    } else if (codepoint <= 0x7FF) {
      // 2-byte sequence: 110xxxxx 10xxxxxx
      s.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
    } else if (codepoint <= 0xFFFF) {
      // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
      s.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
      s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
    } else {
      // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      s.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
      s.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
      s.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
      s.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
    }
  }
  return s;
}

} // namespace arrow
15 changes: 15 additions & 0 deletions cpp/src/arrow/testing/random.h
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,21 @@ ARROW_TESTING_EXPORT
void rand_month_day_nanos(int64_t N,
std::vector<MonthDayNanoIntervalType::MonthDayNanos>* out);

/// \brief Generate a random UTF-8 encoded string
///
/// Generates a string with valid UTF-8 encoding from random Unicode scalar values.
/// The generated string contains num_chars code points. Each code point is drawn
/// by first selecting one of four groups with equal probability -- the Basic
/// Multilingual Plane (BMP), the Supplementary Multilingual Plane (SMP), the
/// Supplementary Ideographic Plane (SIP), or the higher planes (U+30000 up to
/// U+10FFFF) -- and then sampling uniformly within the selected group. The
/// overall distribution is therefore not uniform over all scalar values.
///
/// Within the BMP, control characters below U+0020 and the noncharacters
/// U+FFFE and U+FFFF are never generated. Surrogate code points
/// (U+D800-U+DFFF) are excluded as they are not valid Unicode scalar values.
///
/// \param[in] seed Random seed for reproducibility
/// \param[in] num_chars Number of Unicode code points to generate
/// \return a generated UTF-8 encoded string
ARROW_TESTING_EXPORT
std::string RandomUtf8String(random::SeedType seed, int num_chars);

template <typename T, typename U>
void randint(int64_t N, T lower, T upper, std::vector<U>* out) {
const int random_seed = 0;
Expand Down
Loading