Skip to content

Commit bbfbf5c

Browse files
committed
GH-49434: [C++][CI] Add golden integration files to IPC file fuzz corpus
1 parent cfbbf70 commit bbfbf5c

File tree

3 files changed

+23
-21
lines changed

3 files changed

+23
-21
lines changed

ci/scripts/cpp_test.sh

Lines changed: 15 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -191,16 +191,19 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then
191191
export ASAN_OPTIONS="$ASAN_OPTIONS allocator_may_return_null=1"
192192

193193
# 1. Generate seed corpuses
194+
# For IPC fuzz targets, these will include the golden IPC integration files.
194195
"${source_dir}/build-support/fuzzing/generate_corpuses.sh" "${binary_output_dir}"
195196

196197
# 2. Run fuzz targets on seed corpus entries
197198
function run_fuzz_target_on_seed_corpus() {
198199
fuzz_target_basename=$1
199200
corpus_dir=${binary_output_dir}/${fuzz_target_basename}_seed_corpus
200201
mkdir -p "${corpus_dir}"
201-
rm -f "${corpus_dir}"/*
202-
unzip "${binary_output_dir}"/"${fuzz_target_basename}"_seed_corpus.zip -d "${corpus_dir}"
203-
"${binary_output_dir}"/"${fuzz_target_basename}" -rss_limit_mb=4000 "${corpus_dir}"/*
202+
pushd "${corpus_dir}"
203+
unzip -q "${binary_output_dir}"/"${fuzz_target_basename}"_seed_corpus.zip -d .
204+
"${binary_output_dir}"/"${fuzz_target_basename}" -rss_limit_mb=4000 ./*
205+
popd
206+
rm -rf "${corpus_dir}"
204207
}
205208
run_fuzz_target_on_seed_corpus arrow-csv-fuzz
206209
run_fuzz_target_on_seed_corpus arrow-ipc-file-fuzz
@@ -212,22 +215,17 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then
212215
fi
213216

214217
# 3. Run fuzz targets on regression files from arrow-testing
215-
# Run golden IPC integration files: these should ideally load without errors,
216-
# though some very old ones carry invalid data (such as decimal values
217-
# larger than their advertised precision).
218-
# shellcheck disable=SC2046
219-
"${binary_output_dir}/arrow-ipc-stream-fuzz" $(find "${ARROW_TEST_DATA}"/arrow-ipc-stream/integration -name "*.stream")
220-
# shellcheck disable=SC2046
221-
"${binary_output_dir}/arrow-ipc-file-fuzz" $(find "${ARROW_TEST_DATA}"/arrow-ipc-stream/integration -name "*.arrow_file")
222-
# Run known crash files
223-
"${binary_output_dir}/arrow-ipc-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-stream/crash-*
224-
"${binary_output_dir}/arrow-ipc-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-stream/*-testcase-*
225-
"${binary_output_dir}/arrow-ipc-file-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-file/*-testcase-*
226-
"${binary_output_dir}/arrow-ipc-tensor-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-tensor-stream/*-testcase-*
218+
pushd "${ARROW_TEST_DATA}"
219+
"${binary_output_dir}/arrow-ipc-stream-fuzz" arrow-ipc-stream/crash-*
220+
"${binary_output_dir}/arrow-ipc-stream-fuzz" arrow-ipc-stream/*-testcase-*
221+
"${binary_output_dir}/arrow-ipc-file-fuzz" arrow-ipc-file/*-testcase-*
222+
"${binary_output_dir}/arrow-ipc-tensor-stream-fuzz" arrow-ipc-tensor-stream/*-testcase-*
227223
if [ "${ARROW_PARQUET}" == "ON" ]; then
228-
"${binary_output_dir}/parquet-arrow-fuzz" "${ARROW_TEST_DATA}"/parquet/fuzzing/*-testcase-*
224+
"${binary_output_dir}/parquet-arrow-fuzz" parquet/fuzzing/*-testcase-*
225+
# TODO replay encoding regression files when we have some
229226
fi
230-
"${binary_output_dir}/arrow-csv-fuzz" "${ARROW_TEST_DATA}"/csv/fuzzing/*-testcase-*
227+
"${binary_output_dir}/arrow-csv-fuzz" csv/fuzzing/*-testcase-*
228+
popd
231229
fi
232230

233231
popd

cpp/build-support/fuzzing/generate_corpuses.sh

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ set -ex
2929
CORPUS_DIR=/tmp/corpus
3030
PANDAS_DIR=/tmp/pandas
3131

32-
ARROW_ROOT=$(cd $(dirname $BASH_SOURCE)/../../..; pwd)
32+
ARROW_ROOT=$(cd $(dirname "$BASH_SOURCE")/../../..; pwd)
3333
ARROW_CPP=$ARROW_ROOT/cpp
3434
OUT=$1
3535

@@ -39,17 +39,21 @@ OUT=$1
3939

4040
# Arrow IPC
4141

42-
IPC_INTEGRATION_FILES=$(find ${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.stream")
43-
4442
rm -rf ${CORPUS_DIR}
4543
${OUT}/arrow-ipc-generate-fuzz-corpus -stream ${CORPUS_DIR}
44+
# Add "golden" IPC integration files
45+
IPC_INTEGRATION_FILES=$(find ${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.stream")
46+
[ -z "${IPC_INTEGRATION_FILES}" ] && exit 1
4647
# Several IPC integration files can have the same name, make sure
4748
# they all appear in the corpus by numbering the duplicates.
4849
cp --backup=numbered ${IPC_INTEGRATION_FILES} ${CORPUS_DIR}
4950
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-ipc-stream-fuzz_seed_corpus.zip
5051

5152
rm -rf ${CORPUS_DIR}
5253
${OUT}/arrow-ipc-generate-fuzz-corpus -file ${CORPUS_DIR}
54+
IPC_INTEGRATION_FILES=$(find ${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.arrow_file")
55+
[ -z "${IPC_INTEGRATION_FILES}" ] && exit 1
56+
cp --backup=numbered ${IPC_INTEGRATION_FILES} ${CORPUS_DIR}
5357
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-ipc-file-fuzz_seed_corpus.zip
5458

5559
rm -rf ${CORPUS_DIR}

cpp/src/parquet/arrow/fuzz_encoding_internal.cc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -476,7 +476,7 @@ Status FuzzEncoding(const uint8_t* data, int64_t size) {
476476

477477
BEGIN_PARQUET_CATCH_EXCEPTIONS
478478

479-
auto typed_fuzz = [&](auto* dtype) {
479+
auto typed_fuzz = [header, descr, encoded_data](auto* dtype) {
480480
using DType = std::decay_t<decltype(*dtype)>;
481481
TypedFuzzEncoding<DType> typed_fuzz{header.source_encoding, header.roundtrip_encoding,
482482
&descr, header.num_values, encoded_data};

0 commit comments

Comments
 (0)