25
25
#include < cstdlib>
26
26
#include < algorithm>
27
27
#include < iostream>
28
- #include < map>
29
28
#include < set>
30
29
#include < vector>
31
30
#include " boost/algorithm/string/predicate.hpp"
@@ -125,6 +124,12 @@ void defineInputOptions(po::options_description& opts) {
125
124
" directory, then all the files and symbolic links to files in "
126
125
" the directory are treated as inputs. This option must be "
127
126
" specified at least once." );
127
+ input.add_options ()(" in.is-parquet" , po::bool_switch ()->default_value (false ),
128
+ " If true, input files are assumed to be parquet files." );
129
+ input.add_options ()(" in.parq2csv-quote" , po::bool_switch ()->default_value (false ),
130
+ " If true then put double quotes around valid fields when translating "
131
+ " the parquet files to CSV. This option is only valid when --in.is-parquet "
132
+ " is specified. Note this flag requires --out.csv.no-quote=false." );
128
133
opts.add (input);
129
134
}
130
135
@@ -141,7 +146,6 @@ InputLines const makeInputLines(ConfigStore const& config) {
141
146
" using --in.path." );
142
147
}
143
148
std::vector<fs::path> paths;
144
- bool bIsParquetFile = false ;
145
149
for (auto && s : config.get <std::vector<std::string>>(" in.path" )) {
146
150
fs::path p (s);
147
151
fs::file_status stat = fs::status (p);
@@ -154,32 +158,38 @@ InputLines const makeInputLines(ConfigStore const& config) {
154
158
}
155
159
}
156
160
}
157
- if (!bIsParquetFile)
158
- bIsParquetFile = (boost::algorithm::ends_with (s.c_str (), " .parquet" ) ||
159
- boost::algorithm::ends_with (s.c_str (), " .parq" ));
160
161
}
161
162
if (paths.empty ()) {
162
163
throw std::runtime_error (
163
164
" No non-empty input files found among the "
164
165
" files and directories specified via --in.path." );
165
166
}
166
-
167
- // return InputLines(paths, blockSize * MiB, false, names);
168
- if (!bIsParquetFile) return InputLines (paths, blockSize * MiB, false );
167
+ if (!config.flag (" in.is-parquet" )) return InputLines (paths, blockSize * MiB, false );
169
168
170
169
// In case input files are parquet files, data from config file have to be transferred to the parquet
171
170
// reading class Arrow : collect parameter name list to be read from parquet file
172
- std::vector<std::string> names;
173
- std::string st_null = " " ;
174
- std::string st_delimiter = " " ;
175
- std::string st_escape = " " ;
176
-
177
- if (config.has (" in.csv.field" )) names = config.get <std::vector<std::string>>(" in.csv.field" );
178
- if (config.has (" in.csv.null" )) st_null = config.get <std::string>(" in.csv.null" );
179
- if (config.has (" in.csv.delimiter" )) st_delimiter = config.get <std::string>(" in.csv.delimiter" );
180
- if (config.has (" in.csv.escape" )) st_escape = config.get <std::string>(" in.csv.escape" );
181
-
182
- ConfigParamArrow const configParamArrow{names, st_null, st_delimiter, st_escape};
171
+ std::vector<std::string> columns;
172
+ if (config.has (" in.csv.field" )) {
173
+ columns = config.get <std::vector<std::string>>(" in.csv.field" );
174
+ }
175
+ std::set<std::string> optional;
176
+ if (config.has (" in.csv.optional" )) {
177
+ for (auto const & column : config.get <std::vector<std::string>>(" in.csv.optional" )) {
178
+ optional.insert (column);
179
+ }
180
+ }
181
+ std::string const st_null =
182
+ config.has (" in.csv.null" ) ? config.get <std::string>(" in.csv.null" ) : std::string ();
183
+ std::string const st_delimiter =
184
+ config.has (" in.csv.delimiter" ) ? config.get <std::string>(" in.csv.delimiter" ) : std::string ();
185
+ std::string const st_escape =
186
+ config.has (" in.csv.escape" ) ? config.get <std::string>(" in.csv.escape" ) : std::string ();
187
+ bool const in_quote = config.flag (" in.parq2csv-quote" );
188
+ bool const out_no_quote = config.has (" out.csv.no-quote" ) ? config.get <bool >(" out.csv.no-quote" ) : false ;
189
+ if (in_quote && out_no_quote) {
190
+ throw std::runtime_error (" Option --in.parq2csv-quote=true requires --out.csv.no-quote=false" );
191
+ }
192
+ ConfigParamArrow const configParamArrow{columns, optional, st_null, st_delimiter, st_escape, in_quote};
183
193
184
194
// Direct parquet file reading is not possible using MT - March 2023
185
195
if (config.has (" mr.num-workers" ) && config.get <int >(" mr.num-workers" ) > 1 )
@@ -231,21 +241,21 @@ void ensureOutputFieldExists(ConfigStore& config, std::string const& opt) {
231
241
if (!config.has (opt)) {
232
242
return ;
233
243
}
234
- std::vector<std::string> names ;
244
+ std::vector<std::string> columns ;
235
245
if (!config.has (" out.csv.field" )) {
236
246
if (!config.has (" in.csv.field" )) {
237
- std::cerr << " Input CSV field names not specified." << std::endl;
247
+ std::cerr << " Input CSV column names not specified." << std::endl;
238
248
std::exit (EXIT_FAILURE);
239
249
}
240
- names = config.get <std::vector<std::string>>(" in.csv.field" );
250
+ columns = config.get <std::vector<std::string>>(" in.csv.field" );
241
251
} else {
242
- names = config.get <std::vector<std::string>>(" out.csv.field" );
252
+ columns = config.get <std::vector<std::string>>(" out.csv.field" );
243
253
}
244
- std::string const name = config.get <std::string>(opt);
245
- if (std::find (names .begin (), names .end (), name ) == names .end ()) {
246
- names .push_back (name );
254
+ std::string const column = config.get <std::string>(opt);
255
+ if (std::find (columns .begin (), columns .end (), column ) == columns .end ()) {
256
+ columns .push_back (column );
247
257
}
248
- config.set (" out.csv.field" , names );
258
+ config.set (" out.csv.field" , columns );
249
259
}
250
260
251
261
std::vector<int32_t > const chunksToDuplicate (Chunker const & chunker, ConfigStore const & config) {
0 commit comments