Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions include/ygm/io/csv_parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@ namespace ygm::io {
/**
* @brief Class for parsing collections of CSV files in distributed memory
*/
class csv_parser : public ygm::container::detail::base_iteration_value<
csv_parser, std::tuple<std::vector<detail::csv_field>>> {
template <typename StringType = std::string>
class csv_parser
: public ygm::container::detail::base_iteration_value<
csv_parser<StringType>, std::tuple<std::vector<detail::csv_field>>> {
public:
using for_all_args = std::tuple<std::vector<detail::csv_field>>;

Expand All @@ -37,9 +39,9 @@ class csv_parser : public ygm::container::detail::base_iteration_value<
void for_all(Function fn) {
using namespace ygm::io::detail;

std::map<std::string, int>* header_map_ptr;
bool skip_first;
auto handle_line_lambda = [fn, this](const std::string& line) {
std::map<StringType, int>* header_map_ptr;
bool skip_first;
auto handle_line_lambda = [fn, this](const StringType& line) {
auto vfields = parse_csv_line(line, m_header_map);
// auto stypes = convert_type_string(vfields);
// todo, detect if types are inconsistent between records
Expand Down Expand Up @@ -67,7 +69,7 @@ class csv_parser : public ygm::container::detail::base_iteration_value<
*
* @param label Header label to search for within headers
*/
bool has_header(const std::string& label) {
bool has_header(const StringType& label) {
return m_has_headers && (m_header_map.find(label) != m_header_map.end());
}

Expand All @@ -76,9 +78,9 @@ class csv_parser : public ygm::container::detail::base_iteration_value<
const ygm::comm& comm() const { return m_lp.comm(); }

private:
line_parser m_lp;
line_parser<StringType> m_lp;

std::map<std::string, int> m_header_map;
bool m_has_headers;
std::map<StringType, int> m_header_map;
bool m_has_headers;
}; // namespace ygm::io
} // namespace ygm::io
48 changes: 35 additions & 13 deletions include/ygm/io/line_parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <filesystem>
#include <fstream>
#include <locale>
#include <string>
#include <vector>
#include <ygm/container/detail/base_iteration.hpp>
Expand All @@ -18,10 +19,12 @@ namespace fs = std::filesystem;
/**
* @brief Distributed text file parsing.
*/
template <typename StringType = std::string>
class line_parser : public ygm::container::detail::base_iteration_value<
line_parser, std::tuple<std::string>> {
line_parser<StringType>, std::tuple<StringType>> {
public:
using for_all_args = std::tuple<std::string>;
using for_all_args = std::tuple<StringType>;
using char_t = StringType::value_type; // Type for characters

private:
// enum for tracking storage accessibility
Expand Down Expand Up @@ -175,25 +178,25 @@ class line_parser : public ygm::container::detail::base_iteration_value<
for (const auto& fname : my_file_paths) {
// m_comm.cout("Opening: ", std::get<0>(fname), " ", std::get<1>(fname),
// " ", std::get<2>(fname));
std::ifstream ifs(std::get<0>(fname));
std::basic_ifstream<char_t> ifs(std::get<0>(fname));
// Note: Current process is responsible for reading up to *AND
// INCLUDING* bytes_end
size_t bytes_begin = std::get<1>(fname);
size_t bytes_end = std::get<2>(fname);
YGM_ASSERT_RELEASE(ifs.good());
ifs.imbue(std::locale::classic());
std::string line;
bool first_line = false;
StringType line;
bool first_line = false;
// Throw away line containing bytes_begin as it was read by the previous
// process (unless it corresponds to the beginning of a file)
if (bytes_begin > 0) {
ifs.seekg(bytes_begin);
std::getline(ifs, line);
getline_impl(ifs, line);
} else {
first_line = true;
}
// Keep reading until line containing bytes_end is read
while (ifs.tellg() <= bytes_end && std::getline(ifs, line)) {
while (ifs.tellg() <= bytes_end && getline_impl(ifs, line)) {
// Check if last character is '\r'. This will happen if a file was
// edited on Windows and can cause issues for parsing
if (not line.empty() && (line.back() == 0x0D)) {
Expand All @@ -211,11 +214,11 @@ class line_parser : public ygm::container::detail::base_iteration_value<
my_file_paths.clear();
}

std::string read_first_line() {
std::string line;
StringType read_first_line() {
StringType line;
if (m_comm.rank0()) {
std::ifstream ifs(m_paths[0].first);
std::getline(ifs, line);
std::basic_ifstream<char_t> ifs(m_paths[0].first);
getline_impl(ifs, line);
}

line = m_comm.mpi_bcast(line, 0, m_comm.get_mpi_comm());
Expand Down Expand Up @@ -336,13 +339,32 @@ class line_parser : public ygm::container::detail::base_iteration_value<
* @return false
*/
bool is_file_good(const fs::path& p) {
std::ifstream ifs(p);
bool good = ifs.good();
std::basic_ifstream<char_t> ifs(p);
bool good = ifs.good();
if (!good) {
m_comm.cout("WARNING: unable to open: ", p);
}
return good;
}

/**
 * @brief Execute getline that works with the StringType used
 *
 * For `char` strings this defers to the two-argument std::getline. For
 * `char32_t` strings the newline delimiter is passed explicitly as U'\n'.
 * NOTE(review): presumably the explicit delimiter is needed because the
 * two-argument overload obtains its delimiter through input.widen('\n'),
 * which relies on a std::ctype facet for the stream's character type --
 * confirm.
 *
 * Any other character type is rejected at compile time so that this function
 * can never reach its end without returning a stream reference.
 *
 * @param input Stream to read from
 * @param str String to hold line
 * @return Input stream (same as in std::getline)
 */
std::basic_istream<typename StringType::value_type>& getline_impl(
    std::basic_istream<typename StringType::value_type>& input,
    StringType&                                          str) {
  using value_type = typename StringType::value_type;

  // Only these two character types have a branch below; fail loudly for
  // anything else instead of silently compiling a function with no return.
  static_assert(
      std::is_same_v<value_type, char> || std::is_same_v<value_type, char32_t>,
      "getline_impl supports only char and char32_t based string types");

  if constexpr (std::is_same_v<value_type, char>) {
    return std::getline(input, str);
  } else {
    return std::getline(input, str, U'\n');
  }
}

ygm::comm& m_comm;
std::vector<std::pair<fs::path, accessibility_tag>> m_paths;
bool m_skip_first_line;
Expand Down
2 changes: 1 addition & 1 deletion include/ygm/io/ndjson_parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ class ndjson_parser : public ygm::container::detail::base_iteration_value<
}

private:
line_parser m_lp;
line_parser<std::string> m_lp;

size_t m_num_invalid_records{0};
};
Expand Down
11 changes: 11 additions & 0 deletions test/data/utf8.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
flağ
t✓ag
bag
ceŀl
call
selľ
starţ
stars
falg
🬆
55 changes: 42 additions & 13 deletions test/test_line_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
namespace fs = std::filesystem;

void test_line_parser_files(ygm::comm&, const std::vector<std::string>&);
void test_line_parser_directory(ygm::comm& , const std::string& , size_t );
void test_line_parser_directory(ygm::comm&, const std::string&, size_t);
template <typename StringType>
void test_line_parser_unicode(ygm::comm&);

int main(int argc, char** argv) {
ygm::comm world(&argc, &argv);
Expand All @@ -22,14 +24,14 @@ int main(int argc, char** argv) {
test_line_parser_files(world, {"data/short.txt"});
test_line_parser_files(world, {"data/loremipsum/loremipsum_0.txt"});
test_line_parser_files(world, {"data/loremipsum/loremipsum_0.txt",
"data/loremipsum/loremipsum_1.txt"});
"data/loremipsum/loremipsum_1.txt"});
test_line_parser_files(world, {"data/loremipsum/loremipsum_0.txt",
"data/loremipsum/loremipsum_1.txt",
"data/loremipsum/loremipsum_2.txt"});
"data/loremipsum/loremipsum_1.txt",
"data/loremipsum/loremipsum_2.txt"});
test_line_parser_files(world, {"data/loremipsum/loremipsum_0.txt",
"data/loremipsum/loremipsum_1.txt",
"data/loremipsum/loremipsum_2.txt",
"data/loremipsum/loremipsum_3.txt"});
"data/loremipsum/loremipsum_1.txt",
"data/loremipsum/loremipsum_2.txt",
"data/loremipsum/loremipsum_3.txt"});
test_line_parser_files(
world,
{"data/loremipsum/loremipsum_0.txt", "data/loremipsum/loremipsum_1.txt",
Expand All @@ -46,10 +48,18 @@ int main(int argc, char** argv) {
test_line_parser_directory(world, "data/loremipsum/", 270);
}

{
test_line_parser_unicode<std::string>(world);
#ifndef __APPLE_CC__
test_line_parser_unicode<std::u32string>(world);
#endif
}

return 0;
}

void test_line_parser_files(ygm::comm& comm, const std::vector<std::string>& files) {
void test_line_parser_files(ygm::comm& comm,
const std::vector<std::string>& files) {
//
// Read in each line into a distributed set
ygm::container::counting_set<std::string> line_set_to_test(comm);
Expand All @@ -73,13 +83,13 @@ void test_line_parser_files(ygm::comm& comm, const std::vector<std::string>& fil
}

YGM_ASSERT_RELEASE(line_set.size() == line_set_sequential.size());
//comm.cout0(line_set.size(), " =? ", line_set_to_test.size());
// comm.cout0(line_set.size(), " =? ", line_set_to_test.size());
YGM_ASSERT_RELEASE(line_set.size() == line_set_to_test.size());
// YGM_ASSERT_RELEASE(line_set == line_set_to_test);
}


void test_line_parser_directory(ygm::comm& comm, const std::string& dir, size_t unique_line_count) {
void test_line_parser_directory(ygm::comm& comm, const std::string& dir,
size_t unique_line_count) {
//
// Read in each line into a distributed set
ygm::container::counting_set<std::string> line_set_to_test(comm);
Expand All @@ -88,5 +98,24 @@ void test_line_parser_directory(ygm::comm& comm, const std::string& dir, size_t
line_set_to_test.async_insert(line);
});

YGM_ASSERT_RELEASE(unique_line_count == line_set_to_test.size());
}
YGM_ASSERT_RELEASE(unique_line_count == line_set_to_test.size());
}

/**
 * @brief Check the length line_parser reports for each line of data/utf8.txt
 *
 * The expected values are the number of StringType::value_type elements
 * (what size() returns) per line: one table for `char` strings and one for
 * `char32_t` strings.
 *
 * NOTE(review): line_num counts only the lines visited by the calling rank,
 * starting from 0. This assumes the whole file is handed, in order, to a
 * single rank -- confirm against line_parser's file-splitting policy.
 *
 * @tparam StringType String type the parser is instantiated with
 * @param comm Communicator used to construct the parser
 */
template <typename StringType>
void test_line_parser_unicode(ygm::comm& comm) {
  using char_type = typename StringType::value_type;

  // Value-initialised so the table is never read uninitialised, even if
  // neither branch below applies to char_type.
  std::array<size_t, 11> line_lengths{};
  if constexpr (std::is_same_v<char_type, char>) {
    line_lengths = {5, 6, 3, 5, 4, 5, 6, 5, 4, 3, 4};
  } else if constexpr (std::is_same_v<char_type, char32_t>) {
    line_lengths = {4, 4, 3, 4, 4, 4, 5, 5, 4, 1, 1};
  }

  ygm::io::line_parser<StringType> utf8_parser(comm, {"data/utf8.txt"});

  size_t line_num{0};
  utf8_parser.for_all([&line_lengths, &line_num](const auto& line) {
    // Guard the lookup: an unexpected extra line must fail the test rather
    // than index past the end of the expectation table.
    YGM_ASSERT_RELEASE(line_num < line_lengths.size());
    YGM_ASSERT_RELEASE(line.size() == line_lengths[line_num]);
    ++line_num;
  });
}