diff --git a/include/ygm/io/csv_parser.hpp b/include/ygm/io/csv_parser.hpp index 3bf64369..47bb8b64 100644 --- a/include/ygm/io/csv_parser.hpp +++ b/include/ygm/io/csv_parser.hpp @@ -18,8 +18,10 @@ namespace ygm::io { /** * @brief Class for parsing collections of CSV files in distributed memory */ -class csv_parser : public ygm::container::detail::base_iteration_value< - csv_parser, std::tuple>> { +template +class csv_parser + : public ygm::container::detail::base_iteration_value< + csv_parser, std::tuple>> { public: using for_all_args = std::tuple>; @@ -37,9 +39,9 @@ class csv_parser : public ygm::container::detail::base_iteration_value< void for_all(Function fn) { using namespace ygm::io::detail; - std::map* header_map_ptr; - bool skip_first; - auto handle_line_lambda = [fn, this](const std::string& line) { + std::map* header_map_ptr; + bool skip_first; + auto handle_line_lambda = [fn, this](const StringType& line) { auto vfields = parse_csv_line(line, m_header_map); // auto stypes = convert_type_string(vfields); // todo, detect if types are inconsistent between records @@ -67,7 +69,7 @@ class csv_parser : public ygm::container::detail::base_iteration_value< * * @param label Header label to search for within headers */ - bool has_header(const std::string& label) { + bool has_header(const StringType& label) { return m_has_headers && (m_header_map.find(label) != m_header_map.end()); } @@ -76,9 +78,9 @@ class csv_parser : public ygm::container::detail::base_iteration_value< const ygm::comm& comm() const { return m_lp.comm(); } private: - line_parser m_lp; + line_parser m_lp; - std::map m_header_map; - bool m_has_headers; + std::map m_header_map; + bool m_has_headers; }; // namespace ygm::io } // namespace ygm::io diff --git a/include/ygm/io/line_parser.hpp b/include/ygm/io/line_parser.hpp index c4ae3632..ad69b738 100644 --- a/include/ygm/io/line_parser.hpp +++ b/include/ygm/io/line_parser.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -18,10 +19,12 @@ namespace fs = std::filesystem; /** * @brief Distributed text file parsing. */ +template class line_parser : public ygm::container::detail::base_iteration_value< - line_parser, std::tuple> { + line_parser, std::tuple> { public: - using for_all_args = std::tuple; + using for_all_args = std::tuple; + using char_t = StringType::value_type; // Type for characters private: // enum for tracking storage accessiblity @@ -175,25 +178,25 @@ class line_parser : public ygm::container::detail::base_iteration_value< for (const auto& fname : my_file_paths) { // m_comm.cout("Opening: ", std::get<0>(fname), " ", std::get<1>(fname), // " ", std::get<2>(fname)); - std::ifstream ifs(std::get<0>(fname)); + std::basic_ifstream ifs(std::get<0>(fname)); // Note: Current process is responsible for reading up to *AND // INCLUDING* bytes_end size_t bytes_begin = std::get<1>(fname); size_t bytes_end = std::get<2>(fname); YGM_ASSERT_RELEASE(ifs.good()); ifs.imbue(std::locale::classic()); - std::string line; - bool first_line = false; + StringType line; + bool first_line = false; // Throw away line containing bytes_begin as it was read by the previous // process (unless it corresponds to the beginning of a file) if (bytes_begin > 0) { ifs.seekg(bytes_begin); - std::getline(ifs, line); + getline_impl(ifs, line); } else { first_line = true; } // Keep reading until line containing bytes_end is read - while (ifs.tellg() <= bytes_end && std::getline(ifs, line)) { + while (ifs.tellg() <= bytes_end && getline_impl(ifs, line)) { // Check if last character is '\r'. This will happen if a file was // edited on Windows and can cause issues for parsing if (not line.empty() && (line.back() == 0x0D)) { @@ -211,11 +214,11 @@ class line_parser : public ygm::container::detail::base_iteration_value< my_file_paths.clear(); } - std::string read_first_line() { - std::string line; + StringType read_first_line() { + StringType line; if (m_comm.rank0()) { - std::ifstream ifs(m_paths[0].first); - std::getline(ifs, line); + std::basic_ifstream ifs(m_paths[0].first); + getline_impl(ifs, line); } line = m_comm.mpi_bcast(line, 0, m_comm.get_mpi_comm()); @@ -336,13 +339,32 @@ class line_parser : public ygm::container::detail::base_iteration_value< * @return false */ bool is_file_good(const fs::path& p) { - std::ifstream ifs(p); - bool good = ifs.good(); + std::basic_ifstream ifs(p); + bool good = ifs.good(); if (!good) { m_comm.cout("WARNING: unable to open: ", p); } return good; } + + /** + * @brief Execute getline that works with the StringType used + * + * @param input Stream to read from + * @param str String to hold line + * @return Input stream (same as in std::getline) + */ + std::basic_istream& getline_impl( + std::basic_istream& input, + StringType& str) { + if constexpr (std::is_same_v) { + return std::getline(input, str); + } else if constexpr (std::is_same_v) { + return std::getline(input, str, U'\n'); + } + } + ygm::comm& m_comm; std::vector> m_paths; bool m_skip_first_line; diff --git a/include/ygm/io/ndjson_parser.hpp b/include/ygm/io/ndjson_parser.hpp index aad065d4..762efe67 100644 --- a/include/ygm/io/ndjson_parser.hpp +++ b/include/ygm/io/ndjson_parser.hpp @@ -106,7 +106,7 @@ class ndjson_parser : public ygm::container::detail::base_iteration_value< } private: - line_parser m_lp; + line_parser m_lp; size_t m_num_invalid_records{0}; }; diff --git a/test/data/utf8.txt b/test/data/utf8.txt new file mode 100644 index 00000000..e1644394 --- /dev/null +++ b/test/data/utf8.txt @@ -0,0 +1,11 @@ +flağ +t✓ag +bag +ceŀl +call +selľ +starţ +stars +falg +♘ +🬆 diff --git a/test/test_line_parser.cpp b/test/test_line_parser.cpp index 1223da5c..f15c9795 100644 --- a/test/test_line_parser.cpp +++ b/test/test_line_parser.cpp @@ -13,7 +13,9 @@ namespace fs = std::filesystem; void test_line_parser_files(ygm::comm&, const std::vector&); -void test_line_parser_directory(ygm::comm& , const std::string& , size_t ); +void test_line_parser_directory(ygm::comm&, const std::string&, size_t); +template +void test_line_parser_unicode(ygm::comm&); int main(int argc, char** argv) { ygm::comm world(&argc, &argv); @@ -22,14 +24,14 @@ int main(int argc, char** argv) { test_line_parser_files(world, {"data/short.txt"}); test_line_parser_files(world, {"data/loremipsum/loremipsum_0.txt"}); test_line_parser_files(world, {"data/loremipsum/loremipsum_0.txt", - "data/loremipsum/loremipsum_1.txt"}); + "data/loremipsum/loremipsum_1.txt"}); test_line_parser_files(world, {"data/loremipsum/loremipsum_0.txt", - "data/loremipsum/loremipsum_1.txt", - "data/loremipsum/loremipsum_2.txt"}); + "data/loremipsum/loremipsum_1.txt", + "data/loremipsum/loremipsum_2.txt"}); test_line_parser_files(world, {"data/loremipsum/loremipsum_0.txt", - "data/loremipsum/loremipsum_1.txt", - "data/loremipsum/loremipsum_2.txt", - "data/loremipsum/loremipsum_3.txt"}); + "data/loremipsum/loremipsum_1.txt", + "data/loremipsum/loremipsum_2.txt", + "data/loremipsum/loremipsum_3.txt"}); test_line_parser_files( world, {"data/loremipsum/loremipsum_0.txt", "data/loremipsum/loremipsum_1.txt", @@ -46,10 +48,18 @@ int main(int argc, char** argv) { test_line_parser_directory(world, "data/loremipsum/", 270); } + { + test_line_parser_unicode(world); +#ifndef __APPLE_CC__ + test_line_parser_unicode(world); +#endif + } + return 0; } -void test_line_parser_files(ygm::comm& comm, const std::vector& files) { +void test_line_parser_files(ygm::comm& comm, + const std::vector& files) { // // Read in each line into a distributed set ygm::container::counting_set line_set_to_test(comm); @@ -73,13 +83,13 @@ void test_line_parser_files(ygm::comm& comm, const std::vector& fil } YGM_ASSERT_RELEASE(line_set.size() == line_set_sequential.size()); - //comm.cout0(line_set.size(), " =? ", line_set_to_test.size()); + // comm.cout0(line_set.size(), " =? ", line_set_to_test.size()); YGM_ASSERT_RELEASE(line_set.size() == line_set_to_test.size()); // YGM_ASSERT_RELEASE(line_set == line_set_to_test); } - -void test_line_parser_directory(ygm::comm& comm, const std::string& dir, size_t unique_line_count) { +void test_line_parser_directory(ygm::comm& comm, const std::string& dir, + size_t unique_line_count) { // // Read in each line into a distributed set ygm::container::counting_set line_set_to_test(comm); @@ -88,5 +98,24 @@ void test_line_parser_directory(ygm::comm& comm, const std::string& dir, size_t line_set_to_test.async_insert(line); }); - YGM_ASSERT_RELEASE(unique_line_count == line_set_to_test.size()); -} \ No newline at end of file + YGM_ASSERT_RELEASE(unique_line_count == line_set_to_test.size()); +} + +template +void test_line_parser_unicode(ygm::comm& comm) { + std::array line_lengths; + if constexpr (std::is_same_v) { + line_lengths = {5, 6, 3, 5, 4, 5, 6, 5, 4, 3, 4}; + } else if constexpr (std::is_same_v) { + line_lengths = {4, 4, 3, 4, 4, 4, 5, 5, 4, 1, 1}; + } + + ygm::io::line_parser utf8_parser(comm, {"data/utf8.txt"}); + + size_t line_num{0}; + utf8_parser.for_all([&line_lengths, &line_num](const auto& line) { + YGM_ASSERT_RELEASE(line.size() == line_lengths[line_num]); + ++line_num; + }); +}