Skip to content

Commit 579758e

Browse files
committed
GH-49329: [C++][Parquet][CI] Add fuzz target for encoder/decoder roundtrip
1 parent 2fcc3ec commit 579758e

23 files changed

Lines changed: 1032 additions & 85 deletions

ci/docker/ubuntu-22.04-cpp.dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ RUN apt-get update -y -q && \
120120
rsync \
121121
tzdata \
122122
uuid-runtime \
123+
unzip \
123124
wget \
124125
xz-utils && \
125126
apt-get clean && \

ci/docker/ubuntu-24.04-cpp.dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ RUN apt-get update -y -q && \
122122
tzdata \
123123
tzdata-legacy \
124124
uuid-runtime \
125+
unzip \
125126
wget && \
126127
apt-get clean && \
127128
rm -rf /var/lib/apt/lists*

ci/scripts/cpp_test.sh

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,10 +180,36 @@ fi
180180

181181
if [ "${ARROW_FUZZING}" == "ON" ]; then
182182
# Fuzzing regression tests
183+
184+
# This will display any errors generated during fuzzing. These errors are
185+
# usually not bugs (most fuzz files are invalid and hence generate errors
186+
# when trying to read them), which is why they are hidden by default when
187+
# fuzzing.
188+
export ARROW_FUZZING_VERBOSITY=1
183189
# Some fuzz regression files may trigger huge memory allocations,
184190
# let the allocator return null instead of aborting.
185191
export ASAN_OPTIONS="$ASAN_OPTIONS allocator_may_return_null=1"
186-
export ARROW_FUZZING_VERBOSITY=1
192+
193+
# 1. Generate seed corpuses
194+
"${source_dir}/build-support/fuzzing/generate_corpuses.sh" "${binary_output_dir}"
195+
196+
# 2. Run fuzz targets on seed corpus entries
197+
function run_fuzz_target_on_seed_corpus() {
198+
fuzz_target_basename=$1
199+
corpus_dir=${binary_output_dir}/${fuzz_target_basename}_seed_corpus
200+
mkdir -p "${corpus_dir}"
201+
rm -f "${corpus_dir}"/*
202+
unzip "${binary_output_dir}"/"${fuzz_target_basename}"_seed_corpus.zip -d "${corpus_dir}"
203+
"${binary_output_dir}"/"${fuzz_target_basename}" -rss_limit_mb=4000 "${corpus_dir}"/*
204+
}
205+
run_fuzz_target_on_seed_corpus arrow-csv-fuzz
206+
run_fuzz_target_on_seed_corpus arrow-ipc-file-fuzz
207+
run_fuzz_target_on_seed_corpus arrow-ipc-stream-fuzz
208+
run_fuzz_target_on_seed_corpus arrow-ipc-tensor-stream-fuzz
209+
run_fuzz_target_on_seed_corpus parquet-arrow-fuzz
210+
run_fuzz_target_on_seed_corpus parquet-encoding-fuzz
211+
212+
# 3. Run fuzz targets on regression files from arrow-testing
187213
# Run golden IPC integration files: these should ideally load without errors,
188214
# though some very old ones carry invalid data (such as decimal values
189215
# larger than their advertised precision).

cpp/build-support/fuzzing/generate_corpuses.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ rm -rf ${CORPUS_DIR}
5656
${OUT}/arrow-ipc-generate-tensor-fuzz-corpus -stream ${CORPUS_DIR}
5757
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-ipc-tensor-stream-fuzz_seed_corpus.zip
5858

59-
# Parquet
59+
# Parquet file-level fuzzer
6060

6161
rm -rf ${CORPUS_DIR}
6262
${OUT}/parquet-arrow-generate-fuzz-corpus ${CORPUS_DIR}
@@ -65,6 +65,12 @@ cp ${ARROW_CPP}/submodules/parquet-testing/data/*.parquet ${CORPUS_DIR}
6565
cp ${ARROW_CPP}/submodules/parquet-testing/bad_data/*.parquet ${CORPUS_DIR}
6666
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/parquet-arrow-fuzz_seed_corpus.zip
6767

68+
# Parquet encoding fuzzer
69+
70+
rm -rf ${CORPUS_DIR}
71+
${OUT}/parquet-generate-encoding-fuzz-corpus ${CORPUS_DIR}
72+
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/parquet-encoding-fuzz_seed_corpus.zip
73+
6874
# CSV
6975

7076
rm -rf ${PANDAS_DIR}

cpp/src/arrow/util/macros.h

Lines changed: 11 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -183,28 +183,18 @@
183183
#endif
184184

185185
// ----------------------------------------------------------------------
186+
// Macros to enforce struct member packing
186187

187-
// macros to disable padding
188-
// these macros are portable across different compilers and platforms
189-
//[https://github.com/google/flatbuffers/blob/master/include/flatbuffers/flatbuffers.h#L1355]
190-
#if !defined(MANUALLY_ALIGNED_STRUCT)
191-
# if defined(_MSC_VER)
192-
# define MANUALLY_ALIGNED_STRUCT(alignment) \
193-
__pragma(pack(1)); \
194-
struct __declspec(align(alignment))
195-
# define STRUCT_END(name, size) \
196-
__pragma(pack()); \
197-
static_assert(sizeof(name) == size, "compiler breaks packing rules")
198-
# elif defined(__GNUC__) || defined(__clang__)
199-
# define MANUALLY_ALIGNED_STRUCT(alignment) \
200-
_Pragma("pack(1)") struct __attribute__((aligned(alignment)))
201-
# define STRUCT_END(name, size) \
202-
_Pragma("pack()") static_assert(sizeof(name) == size, \
203-
"compiler breaks packing rules")
204-
# else
205-
# error Unknown compiler, please define structure alignment macros
206-
# endif
207-
#endif // !defined(MANUALLY_ALIGNED_STRUCT)
188+
#if defined(__GNUC__)
189+
# define ARROW_PACKED_START(KEYWORD, ...) KEYWORD [[gnu::packed]] __VA_ARGS__
190+
# define ARROW_PACKED_END
191+
#elif defined(_MSC_VER)
192+
# define ARROW_PACKED_START(KEYWORD, ...) _Pragma("pack(push, 1)") KEYWORD __VA_ARGS__
193+
# define ARROW_PACKED_END _Pragma("pack(pop)")
194+
#else
195+
# define ARROW_PACKED_START(KEYWORD, ...) KEYWORD __VA_ARGS__
196+
# define ARROW_PACKED_END
197+
#endif
208198

209199
// ----------------------------------------------------------------------
210200
// Convenience macro disabling a particular UBSan check in a function

cpp/src/parquet/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ endif()
151151
# Library config
152152

153153
set(PARQUET_SRCS
154+
arrow/fuzz_encoding_internal.cc
154155
arrow/fuzz_internal.cc
155156
arrow/path_internal.cc
156157
arrow/reader.cc

cpp/src/parquet/arrow/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,19 @@ arrow_install_all_headers("parquet/arrow")
1919

2020
if(ARROW_BUILD_FUZZING_UTILITIES)
2121
add_executable(parquet-arrow-generate-fuzz-corpus generate_fuzz_corpus.cc)
22+
add_executable(parquet-generate-encoding-fuzz-corpus generate_encoding_fuzz_corpus.cc)
2223
if(ARROW_BUILD_STATIC)
2324
target_link_libraries(parquet-arrow-generate-fuzz-corpus parquet_static
2425
arrow_testing_static)
26+
target_link_libraries(parquet-generate-encoding-fuzz-corpus parquet_static
27+
arrow_testing_static)
2528
else()
2629
target_link_libraries(parquet-arrow-generate-fuzz-corpus parquet_shared
2730
arrow_testing_shared)
31+
target_link_libraries(parquet-generate-encoding-fuzz-corpus parquet_shared
32+
arrow_testing_shared)
2833
endif()
2934
endif()
3035

3136
add_parquet_fuzz_target(fuzz PREFIX "parquet-arrow")
37+
add_parquet_fuzz_target(encoding_fuzz PREFIX "parquet")
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "arrow/status.h"
19+
#include "arrow/util/fuzz_internal.h"
20+
#include "parquet/arrow/fuzz_encoding_internal.h"
21+
22+
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
23+
auto status =
24+
parquet::fuzzing::internal::FuzzEncoding(data, static_cast<int64_t>(size));
25+
arrow::internal::LogFuzzStatus(status, data, static_cast<int64_t>(size));
26+
return 0;
27+
}

0 commit comments

Comments
 (0)