Skip to content

Commit d52f177

Browse files
Performance/CMake Improvements (vincentlaucsb#27)
The parser is now capable of just over 200 MB/sec when reading from disk and 240 MB/sec when reading from memory on an Intel Core i7-8550U CPU.
1 parent b5b4a72 commit d52f177

File tree

7 files changed

+238
-187
lines changed

7 files changed

+238
-187
lines changed

CMakeLists.txt

Lines changed: 7 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -19,67 +19,17 @@ else()
1919
set(CMAKE_CXX_FLAGS_DEBUG "-Og -g -lgcov --coverage")
2020
endif(MSVC)
2121

22-
message("CSV for C++ ${CMAKE_BUILD_TYPE} Build with ${CMAKE_CXX_COMPILER}")
22+
set(CSV_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include/)
23+
set(CSV_SOURCE_DIR ${CSV_INCLUDE_DIR}/internal/)
24+
set(CSV_TEST_DIR ${CMAKE_CURRENT_LIST_DIR}/tests)
2325

24-
set(SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/include/internal/)
25-
set(TEST_DIR ${CMAKE_CURRENT_LIST_DIR}/tests)
26-
27-
# file(GLOB_RECURSE SOURCES include/ *.hpp *.cpp)
28-
set(SOURCES
29-
${SOURCE_DIR}/csv_reader.cpp
30-
${SOURCE_DIR}/csv_reader_iterator.cpp
31-
${SOURCE_DIR}/csv_row.cpp
32-
${SOURCE_DIR}/csv_stat.cpp
33-
${SOURCE_DIR}/csv_utility.cpp
34-
${SOURCE_DIR}/data_type.cpp
35-
${SOURCE_DIR}/giant_string_buffer.cpp
36-
)
37-
set(TEST_SOURCES
38-
${TEST_DIR}/catch.hpp
39-
${TEST_DIR}/main.cpp
40-
${TEST_DIR}/test_csv_iterator.cpp
41-
${TEST_DIR}/test_csv_buffer.cpp
42-
${TEST_DIR}/test_csv_row.cpp
43-
${TEST_DIR}/test_csv_stat.cpp
44-
${TEST_DIR}/test_read_csv.cpp
45-
${TEST_DIR}/test_write_csv.cpp
46-
${TEST_DIR}/test_data_type.cpp
47-
)
48-
49-
include_directories(${CMAKE_CURRENT_LIST_DIR}/include/)
50-
include_directories(${TEST_DIR})
26+
include_directories(${CSV_INCLUDE_DIR})
5127

5228
## Main Library
53-
add_library(csv STATIC ${SOURCES})
54-
set_target_properties(csv PROPERTIES LINKER_LANGUAGE CXX)
29+
add_subdirectory(${CSV_SOURCE_DIR})
5530

5631
## Executables
57-
add_executable(csv_info ${CMAKE_CURRENT_LIST_DIR}/programs/csv_info.cpp)
58-
target_link_libraries(csv_info csv)
59-
60-
add_executable(csv_bench ${CMAKE_CURRENT_LIST_DIR}/programs/csv_bench.cpp)
61-
target_link_libraries(csv_bench csv)
62-
63-
add_executable(csv_guess_bench ${CMAKE_CURRENT_LIST_DIR}/programs/csv_guess_bench.cpp)
64-
target_link_libraries(csv_guess_bench csv)
65-
66-
add_executable(csv_stats ${CMAKE_CURRENT_LIST_DIR}/programs/csv_stats.cpp)
67-
target_link_libraries(csv_stats csv)
68-
69-
add_executable(csv_generator ${CMAKE_CURRENT_LIST_DIR}/programs/csv_generator.cpp)
70-
target_link_libraries(csv_generator csv)
71-
72-
add_executable(data_type_bench ${CMAKE_CURRENT_LIST_DIR}/programs/data_type_bench.cpp)
73-
target_link_libraries(data_type_bench csv)
32+
add_subdirectory("programs")
7433

7534
## Tests
76-
add_executable(csv_test ${TEST_SOURCES})
77-
target_link_libraries(csv_test csv)
78-
add_custom_command(
79-
TARGET csv_test POST_BUILD
80-
COMMAND ${CMAKE_COMMAND} -E copy_directory
81-
${TEST_DIR}/data $<TARGET_FILE_DIR:csv_test>/tests/data
82-
)
83-
84-
enable_testing()
85-
add_test(test csv_test)
35+
add_subdirectory("tests")

include/internal/CMakeLists.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
add_library(csv STATIC "")
2+
3+
target_sources(csv
4+
PRIVATE
5+
csv_reader.cpp
6+
csv_reader_iterator.cpp
7+
csv_row.cpp
8+
csv_stat.cpp
9+
csv_utility.cpp
10+
data_type.cpp
11+
giant_string_buffer.cpp
12+
)
13+
14+
set_target_properties(csv PROPERTIES LINKER_LANGUAGE CXX)

include/internal/csv_reader.cpp

Lines changed: 63 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -267,8 +267,8 @@ namespace csv {
267267
return CSV_NOT_FOUND;
268268
}
269269

270-
void CSVReader::feed(std::unique_ptr<char[]>&& buff) {
271-
this->feed(csv::string_view(buff.get()));
270+
void CSVReader::feed(WorkItem&& buff) {
271+
this->feed( csv::string_view(buff.first.get(), buff.second) );
272272
}
273273

274274
void CSVReader::feed(csv::string_view in) {
@@ -296,56 +296,72 @@ namespace csv {
296296
this->record_buffer->reserve(in.size());
297297
std::string& _record_buffer = *(this->record_buffer.get());
298298

299-
for (size_t i = 0; i < in.size(); i++) {
300-
if (!quote_escape) {
301-
switch (this->parse_flags[in[i] + 128]) {
302-
case NOT_SPECIAL:
303-
_record_buffer +=in[i];
304-
break;
299+
const size_t in_size = in.size();
300+
for (size_t i = 0; i < in_size; i++) {
301+
switch (this->parse_flags[in[i] + 128]) {
305302
case DELIMITER:
306-
this->split_buffer.push_back(this->record_buffer.size());
307-
break;
303+
if (!quote_escape) {
304+
this->split_buffer.push_back(this->record_buffer.size());
305+
break;
306+
}
308307
case NEWLINE:
309-
// End of record -> Write record
310-
if (i + 1 < in.size() && in[i + 1] == '\n') // Catches CRLF (or LFLF)
311-
++i;
312-
this->write_record();
313-
break;
314-
default: // Quote
315-
// Case: Previous character was delimiter or newline
316-
if (i) { // Don't deref past beginning
317-
auto prev_ch = this->parse_flags[in[i - 1] + 128];
318-
if (prev_ch >= DELIMITER) quote_escape = true;
308+
if (!quote_escape) {
309+
// End of record -> Write record
310+
if (i + 1 < in_size && in[i + 1] == '\n') // Catches CRLF (or LFLF)
311+
++i;
312+
this->write_record();
313+
break;
319314
}
315+
case NOT_SPECIAL: {
316+
// Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
317+
// sequences, use the loop below to avoid having to go through the outer
318+
// switch statement as much as possible
319+
#if __cplusplus >= 201703L
320+
size_t start = i;
321+
while (i + 1 < in_size && this->parse_flags[in[i + 1] + 128] == NOT_SPECIAL) {
322+
i++;
323+
}
324+
325+
_record_buffer += in.substr(start, i - start + 1);
326+
#else
327+
_record_buffer += in[i];
328+
329+
while (i + 1 < in_size && this->parse_flags[in[i + 1] + 128] == NOT_SPECIAL) {
330+
_record_buffer += in[++i];
331+
}
332+
#endif
333+
320334
break;
321335
}
322-
}
323-
else {
324-
switch (this->parse_flags[in[i] + 128]) {
325-
case NOT_SPECIAL:
326-
case DELIMITER:
327-
case NEWLINE:
328-
// Treat as a regular character
329-
_record_buffer +=in[i];
330-
break;
331336
default: // Quote
337+
if (!quote_escape) {
338+
// Don't deref past beginning
339+
if (i && this->parse_flags[in[i - 1] + 128] >= DELIMITER) {
340+
// Case: Previous character was delimiter or newline
341+
quote_escape = true;
342+
}
343+
344+
break;
345+
}
346+
332347
auto next_ch = this->parse_flags[in[i + 1] + 128];
333348
if (next_ch >= DELIMITER) {
334349
// Case: Delim or newline => end of field
335350
quote_escape = false;
351+
break;
336352
}
337-
else {
338-
// Case: Escaped quote
339-
_record_buffer +=in[i];
340-
341-
if (next_ch == QUOTE)
342-
++i; // Case: Two consecutive quotes
343-
else if (this->strict)
344-
throw std::runtime_error("Unescaped single quote around line " +
345-
std::to_string(this->correct_rows) + " near:\n" +
346-
std::string(in.substr(i, 100)));
347-
}
348-
}
353+
354+
// Case: Escaped quote
355+
_record_buffer += in[i];
356+
357+
if (next_ch == QUOTE)
358+
++i; // Case: Two consecutive quotes
359+
else if (this->strict)
360+
throw std::runtime_error("Unescaped single quote around line " +
361+
std::to_string(this->correct_rows) + " near:\n" +
362+
std::string(in.substr(i, 100)));
363+
364+
break;
349365
}
350366
}
351367

@@ -415,7 +431,7 @@ namespace csv {
415431
this->feed_buffer.pop_front();
416432

417433
// Nullptr --> Die
418-
if (!in) break;
434+
if (!in.first) break;
419435

420436
lock.unlock(); // Release lock
421437
this->feed(std::move(in));
@@ -455,11 +471,12 @@ namespace csv {
455471
char * result = std::fgets(line_buffer, internals::PAGE_SIZE, this->infile);
456472
if (result == NULL) break;
457473
line_buffer += std::strlen(line_buffer);
474+
size_t current_strlen = line_buffer - buffer.get();
458475

459-
if ((line_buffer - buffer.get()) >= 0.9 * BUFFER_UPPER_LIMIT) {
476+
if (current_strlen >= 0.9 * BUFFER_UPPER_LIMIT) {
460477
processed += (line_buffer - buffer.get());
461478
std::unique_lock<std::mutex> lock{ this->feed_lock };
462-
this->feed_buffer.push_back(std::move(buffer));
479+
this->feed_buffer.push_back(std::make_pair<>(std::move(buffer), current_strlen));
463480
this->feed_cond.notify_one();
464481

465482
buffer = std::unique_ptr<char[]>(new char[BUFFER_UPPER_LIMIT]); // New pointer
@@ -470,8 +487,8 @@ namespace csv {
470487

471488
// Feed remaining bits
472489
std::unique_lock<std::mutex> lock{ this->feed_lock };
473-
this->feed_buffer.push_back(std::move(buffer));
474-
this->feed_buffer.push_back(nullptr); // Termination signal
490+
this->feed_buffer.push_back(std::make_pair<>(std::move(buffer), line_buffer - buffer.get()));
491+
this->feed_buffer.push_back(std::make_pair<>(nullptr, 0)); // Termination signal
475492
this->feed_cond.notify_one();
476493
lock.unlock();
477494
worker.join();

include/internal/csv_reader.hpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,9 @@ namespace csv {
151151
NEWLINE
152152
};
153153

154+
using WorkItem = std::pair<std::unique_ptr<char[]>, size_t>; /**<
155+
@brief A string buffer and its size */
156+
154157
std::vector<CSVReader::ParseFlags> make_flags() const;
155158

156159
internals::GiantStringBuffer record_buffer; /**<
@@ -195,7 +198,7 @@ namespace csv {
195198

196199
/** @name Multi-Threaded File Reading Functions */
197200
///@{
198-
void feed(std::unique_ptr<char[]>&&); /**< @brief Helper for read_csv_worker() */
201+
void feed(WorkItem&&); /**< @brief Helper for read_csv_worker() */
199202
void read_csv(
200203
const std::string& filename,
201204
const size_t& bytes = internals::ITERATION_CHUNK_SIZE
@@ -208,8 +211,7 @@ namespace csv {
208211
std::FILE* infile = nullptr; /**< @brief Current file handle.
209212
Destroyed by ~CSVReader(). */
210213

211-
std::deque<std::unique_ptr<char[]>>
212-
feed_buffer; /**< @brief Message queue for worker */
214+
std::deque<WorkItem> feed_buffer; /**< @brief Message queue for worker */
213215

214216
std::mutex feed_lock; /**< @brief Allow only one worker to write */
215217
std::condition_variable feed_cond; /**< @brief Wake up worker */

programs/CMakeLists.txt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
add_executable(csv_info ${CMAKE_CURRENT_LIST_DIR}/csv_info.cpp)
2+
target_link_libraries(csv_info csv)
3+
4+
add_executable(csv_bench ${CMAKE_CURRENT_LIST_DIR}/csv_bench.cpp)
5+
target_link_libraries(csv_bench csv)
6+
7+
add_executable(csv_guess_bench ${CMAKE_CURRENT_LIST_DIR}/csv_guess_bench.cpp)
8+
target_link_libraries(csv_guess_bench csv)
9+
10+
add_executable(csv_stats ${CMAKE_CURRENT_LIST_DIR}/csv_stats.cpp)
11+
target_link_libraries(csv_stats csv)
12+
13+
add_executable(csv_generator ${CMAKE_CURRENT_LIST_DIR}/csv_generator.cpp)
14+
target_link_libraries(csv_generator csv)
15+
16+
add_executable(data_type_bench ${CMAKE_CURRENT_LIST_DIR}/data_type_bench.cpp)
17+
target_link_libraries(data_type_bench csv)

0 commit comments

Comments
 (0)