Skip to content

Commit 902ca15

Browse files
committed
Address review feedback: rename to FlushToSink and consolidate Python tests
- Rename WriteAndClearBuffer to FlushToSink (shorter, clearer intent)
- Consolidate Python tests into a single parameterized test with 3 cases: empty batch at beginning, middle, and end
1 parent 2a16d2a commit 902ca15

File tree

2 files changed

+31
-30
lines changed

2 files changed

+31
-30
lines changed

cpp/src/arrow/csv/writer.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -541,7 +541,7 @@ class CSVWriterImpl : public ipc::RecordBatchWriter {
541541
for (auto maybe_slice : iterator) {
542542
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<RecordBatch> slice, maybe_slice);
543543
RETURN_NOT_OK(TranslateMinimalBatch(*slice));
544-
RETURN_NOT_OK(WriteAndClearBuffer());
544+
RETURN_NOT_OK(FlushToSink());
545545
stats_.num_record_batches++;
546546
}
547547
return Status::OK();
@@ -554,7 +554,7 @@ class CSVWriterImpl : public ipc::RecordBatchWriter {
554554
RETURN_NOT_OK(reader.ReadNext(&batch));
555555
while (batch != nullptr) {
556556
RETURN_NOT_OK(TranslateMinimalBatch(*batch));
557-
RETURN_NOT_OK(WriteAndClearBuffer());
557+
RETURN_NOT_OK(FlushToSink());
558558
RETURN_NOT_OK(reader.ReadNext(&batch));
559559
stats_.num_record_batches++;
560560
}
@@ -590,9 +590,9 @@ class CSVWriterImpl : public ipc::RecordBatchWriter {
590590
return Status::OK();
591591
}
592592

593-
// GH-36889: Write buffer to sink and clear it to avoid stale content
593+
// GH-36889: Flush buffer to sink and clear it to avoid stale content
594594
// being written again if the next batch is empty.
595-
Status WriteAndClearBuffer() {
595+
Status FlushToSink() {
596596
RETURN_NOT_OK(sink_->Write(data_buffer_));
597597
return data_buffer_->Resize(0, /*shrink_to_fit=*/false);
598598
}
@@ -661,7 +661,7 @@ class CSVWriterImpl : public ipc::RecordBatchWriter {
661661
next += options_.eol.size();
662662
DCHECK_EQ(reinterpret_cast<uint8_t*>(next),
663663
data_buffer_->data() + data_buffer_->size());
664-
return WriteAndClearBuffer();
664+
return FlushToSink();
665665
}
666666

667667
Status TranslateMinimalBatch(const RecordBatch& batch) {

python/pyarrow/tests/test_csv.py

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2067,34 +2067,35 @@ def readinto(self, *args):
20672067
read_csv(MyBytesIO(data))
20682068

20692069

2070-
def test_write_csv_empty_batch_no_duplicate_header():
2071-
# GH-36889: Empty batches at the start should not cause duplicate headers
2072-
table = pa.table({"col1": ["a", "b", "c"]})
2073-
2074-
# Concatenate empty table with data table
2075-
empty_table = table.schema.empty_table()
2076-
combined = pa.concat_tables([empty_table, table])
2077-
2078-
buf = io.BytesIO()
2079-
write_csv(combined, buf)
2080-
buf.seek(0)
2081-
result = buf.read()
2082-
2083-
# Should have exactly one header, not two
2084-
assert result == b'"col1"\n"a"\n"b"\n"c"\n'
2085-
2086-
2087-
def test_write_csv_empty_batch_in_middle():
2088-
# GH-36889: Empty batches in the middle should not cause issues
2089-
table1 = pa.table({"col1": ["a"]})
2090-
table2 = pa.table({"col1": ["b"]})
2091-
empty_table = table1.schema.empty_table()
2092-
2093-
combined = pa.concat_tables([table1, empty_table, table2])
2070+
@pytest.mark.parametrize("tables,expected", [
2071+
# GH-36889: Empty batch at the beginning
2072+
(
2073+
lambda: [pa.table({"col1": []}).cast(pa.schema([("col1", pa.string())])),
2074+
pa.table({"col1": ["a"]}),
2075+
pa.table({"col1": ["b"]})],
2076+
b'"col1"\n"a"\n"b"\n'
2077+
),
2078+
# GH-36889: Empty batch in the middle
2079+
(
2080+
lambda: [pa.table({"col1": ["a"]}),
2081+
pa.table({"col1": []}).cast(pa.schema([("col1", pa.string())])),
2082+
pa.table({"col1": ["b"]})],
2083+
b'"col1"\n"a"\n"b"\n'
2084+
),
2085+
# GH-36889: Empty batch at the end
2086+
(
2087+
lambda: [pa.table({"col1": ["a"]}),
2088+
pa.table({"col1": ["b"]}),
2089+
pa.table({"col1": []}).cast(pa.schema([("col1", pa.string())]))],
2090+
b'"col1"\n"a"\n"b"\n'
2091+
),
2092+
])
2093+
def test_write_csv_empty_batch_should_not_pollute_output(tables, expected):
2094+
combined = pa.concat_tables(tables())
20942095

20952096
buf = io.BytesIO()
20962097
write_csv(combined, buf)
20972098
buf.seek(0)
20982099
result = buf.read()
20992100

2100-
assert result == b'"col1"\n"a"\n"b"\n'
2101+
assert result == expected

0 commit comments

Comments (0)