22
22
#include < iomanip>
23
23
#include < iostream>
24
24
#include < fstream>
25
- #include < map>
26
25
#include < memory>
26
+ #include < set>
27
27
#include < stdexcept>
28
28
#include < string>
29
29
#include < vector>
30
30
31
+ // Third party headers
31
32
#include " boost/program_options.hpp"
33
+ #include " nlohmann/json.hpp"
32
34
33
35
#include " partition/ParquetInterface.h"
34
36
35
37
namespace po = boost::program_options;
36
38
namespace part = lsst::partition;
37
39
38
40
using namespace std ;
41
+ using json = nlohmann::json;
39
42
40
43
namespace {
41
44
@@ -60,19 +63,20 @@ class CommandLineParams {
60
63
po::options_description desc (help, 80 );
61
64
desc.add_options ()(" help,h" , " Produce this help" );
62
65
desc.add_options ()(" verbose,v" , " Produce verbose output." );
66
+ desc.add_options ()(" csv-quote-fields" , " Double quote fields as needed in the generated CSV." );
63
67
desc.add_options ()(" max-proc-mem-mb" , po::value<int >()->default_value (maxMemAllocatedMB),
64
68
" Max size (MB) of RAM allocated to the process." );
65
69
desc.add_options ()(" buf-size-mb" , po::value<int >()->default_value (maxBuffSizeMB),
66
70
" Buffers size (MB) for translating batches." );
67
- desc.add_options ()(" parq-file " , po::value<vector<string>>(), " Input file to be translated." );
68
- desc.add_options ()(" coldef-file " , po::value<vector<string>>(),
69
- " Input file with the names of columns to be extracted." );
70
- desc.add_options ()(" csv-file " , po::value<vector<string>>(), " Output file to be written." );
71
+ desc.add_options ()(" parq" , po::value<vector<string>>(), " Input file to be translated." );
72
+ desc.add_options ()(" config " , po::value<vector<string>>(),
73
+ " Input JSON file with definition of columns to be extracted." );
74
+ desc.add_options ()(" csv" , po::value<vector<string>>(), " Output file to be written." );
71
75
72
76
po::positional_options_description p;
73
- p.add (" parq-file " , 1 );
74
- p.add (" coldef-file " , 1 );
75
- p.add (" csv-file " , 1 );
77
+ p.add (" parq" , 1 );
78
+ p.add (" config " , 1 );
79
+ p.add (" csv" , 1 );
76
80
77
81
po::variables_map vm;
78
82
po::store (po::command_line_parser (argc, argv).options (desc).positional (p).run (), vm);
@@ -82,17 +86,17 @@ class CommandLineParams {
82
86
cout << desc << " \n " ;
83
87
return false ;
84
88
}
85
- parqFileName = vm.count (" parq-file " ) ? vm[" parq-file " ].as <vector<string>>().front () : string ();
86
- coldefFileName = vm.count (" coldef-file " ) ? vm[" coldef-file " ].as <vector<string>>().front () : string ();
87
- csvFileName = vm.count (" csv-file " ) ? vm[" csv-file " ].as <vector<string>>().front () : string ();
89
+ parqFileName = vm.count (" parq" ) ? vm[" parq" ].as <vector<string>>().front () : string ();
90
+ configFileName = vm.count (" config " ) ? vm[" config " ].as <vector<string>>().front () : string ();
91
+ csvFileName = vm.count (" csv" ) ? vm[" csv" ].as <vector<string>>().front () : string ();
88
92
89
- if (parqFileName.empty () || coldefFileName .empty () || csvFileName.empty ()) {
93
+ if (parqFileName.empty () || configFileName .empty () || csvFileName.empty ()) {
90
94
throw runtime_error (" The names of all required files must be provided." );
91
95
}
92
96
if (csvFileName == parqFileName) {
93
97
throw runtime_error (" Input and output file names must be different." );
94
98
}
95
- _parseColdDefFile ();
99
+ _parseConfigFile ();
96
100
97
101
if (vm.count (" max-mem-alloc-mb" )) {
98
102
maxMemAllocatedMB = vm[" max-mem-alloc-mb" ].as <int >();
@@ -107,18 +111,19 @@ class CommandLineParams {
107
111
throw runtime_error (" Buffer size (MB) must be in a range of [1,1024]." );
108
112
}
109
113
verbose = vm.count (" verbose" ) != 0 ;
114
+ quote = vm.count (" csv-quote-fields" ) != 0 ;
110
115
111
116
return true ;
112
117
}
113
118
114
119
// Values of the parsed parameters are stored in the data members defined below.
115
120
116
121
string parqFileName;
117
- string coldefFileName ;
122
+ string configFileName ;
118
123
string csvFileName;
119
124
120
125
vector<string> columns;
121
- map <string, string> optionalColumnDefs ;
126
+ set <string> optionalColumns ;
122
127
123
128
int maxMemAllocatedMB = 3000 ;
124
129
int maxBuffSizeMB = 16 ;
@@ -127,21 +132,47 @@ class CommandLineParams {
127
132
string const delimStr = " \t " ;
128
133
129
134
bool verbose = false ;
135
+ bool quote = false ;
130
136
131
137
private:
132
- void _parseColdDefFile () {
133
- columns.clear ();
134
- ifstream columnsFile (coldefFileName);
135
- if (!columnsFile) {
136
- throw runtime_error (" Error while opening the columns file." );
138
+ void _parseConfigFile () {
139
+ ifstream file (configFileName, ios_base::in);
140
+ if (!file.good ()) throw invalid_argument (" Failed to open file: '" + configFileName + " '" );
141
+ json config;
142
+ try {
143
+ file >> config;
144
+ } catch (...) {
145
+ throw runtime_error (" Config file: '" + configFileName + " ' doesn't have a valid JSON payload" );
137
146
}
138
- string column;
139
- while (columnsFile >> column) {
140
- columns.push_back (column);
147
+ if (!config.is_object ()) {
148
+ throw invalid_argument (" Config file: '" + configFileName + " ' is not a valid JSON object" );
141
149
}
150
+ if (!config.contains (" columns" )) {
151
+ throw runtime_error (" The JSON file must contain a 'columns' key." );
152
+ }
153
+ if (!config[" columns" ].is_array ()) {
154
+ throw runtime_error (" The 'columns' key must contain an array." );
155
+ }
156
+ columns = config[" columns" ].get <vector<string>>();
142
157
if (columns.empty ()) {
143
158
throw runtime_error (" No columns to be extracted." );
144
159
}
160
+ optionalColumns.clear ();
161
+ if (config.contains (" optional" )) {
162
+ if (!config[" optional" ].is_array ()) {
163
+ throw runtime_error (" The 'optional' key must contain an object." );
164
+ }
165
+ for (auto const & column : config[" optional" ].get <vector<string>>()) {
166
+ optionalColumns.insert (column);
167
+ }
168
+ }
169
+ // All optional columns must be defined in the 'columns' array.
170
+ for (auto const & name : optionalColumns) {
171
+ if (find (columns.begin (), columns.end (), name) == columns.end ()) {
172
+ throw runtime_error (" The optional column '" + name +
173
+ " ' is not defined in the 'columns' array." );
174
+ }
175
+ }
145
176
}
146
177
};
147
178
@@ -163,18 +194,20 @@ int main(int argc, char const* const* argv) {
163
194
cout << " Translating '" << params.parqFileName << " ' into '" << params.csvFileName << " '" << endl;
164
195
}
165
196
part::ParquetFile parqFile (params.parqFileName , params.maxMemAllocatedMB );
166
- if (parqFile.setupBatchReader (maxBuffSizeBytes) != arrow::Status::OK ()) {
167
- throw runtime_error (" Error while setting up the batch reader." );
168
- }
197
+ parqFile.setupBatchReader (maxBuffSizeBytes);
198
+
169
199
ofstream csvFile (params.csvFileName , ios::out | ios::binary);
170
200
if (!csvFile) {
171
201
throw runtime_error (" Error while opening the output file." );
172
202
}
173
203
while (true ) {
174
- auto status = parqFile.readNextBatch_Table2CSV (buf.get (), buffSize, params.columns ,
175
- params.optionalColumnDefs , params.nullStr ,
176
- params.delimStr );
177
- if ((status != arrow::Status::OK ()) || (buffSize == 0 )) break ;
204
+ bool const success = parqFile.readNextBatch_Table2CSV (buf.get (), buffSize, params.columns ,
205
+ params.optionalColumns , params.nullStr ,
206
+ params.delimStr , params.quote );
207
+ if (!success) break ;
208
+ if (buffSize == 0 ) {
209
+ throw runtime_error (" Received EOF while reading the file." );
210
+ }
178
211
if (params.verbose ) {
179
212
cout << " Writing " << setw (9 ) << buffSize << " bytes" << endl;
180
213
}
0 commit comments