22
22
#include < iomanip>
23
23
#include < iostream>
24
24
#include < fstream>
25
- #include < map>
26
25
#include < memory>
26
+ #include < set>
27
27
#include < stdexcept>
28
28
#include < string>
29
29
#include < vector>
30
30
31
+ // Third party headers
31
32
#include " boost/program_options.hpp"
33
+ #include " nlohmann/json.hpp"
32
34
33
35
#include " partition/ParquetInterface.h"
34
36
35
37
namespace po = boost::program_options;
36
38
namespace part = lsst::partition;
37
39
38
40
using namespace std ;
41
+ using json = nlohmann::json;
39
42
40
43
namespace {
41
44
@@ -64,15 +67,15 @@ class CommandLineParams {
64
67
" Max size (MB) of RAM allocated to the process." );
65
68
desc.add_options ()(" buf-size-mb" , po::value<int >()->default_value (maxBuffSizeMB),
66
69
" Buffers size (MB) for translating batches." );
67
- desc.add_options ()(" parq-file " , po::value<vector<string>>(), " Input file to be translated." );
68
- desc.add_options ()(" coldef-file " , po::value<vector<string>>(),
69
- " Input file with the names of columns to be extracted." );
70
- desc.add_options ()(" csv-file " , po::value<vector<string>>(), " Output file to be written." );
70
+ desc.add_options ()(" parq" , po::value<vector<string>>(), " Input file to be translated." );
71
+ desc.add_options ()(" config " , po::value<vector<string>>(),
72
+ " Input JSON file with definition of columns to be extracted." );
73
+ desc.add_options ()(" csv" , po::value<vector<string>>(), " Output file to be written." );
71
74
72
75
po::positional_options_description p;
73
- p.add (" parq-file " , 1 );
74
- p.add (" coldef-file " , 1 );
75
- p.add (" csv-file " , 1 );
76
+ p.add (" parq" , 1 );
77
+ p.add (" config " , 1 );
78
+ p.add (" csv" , 1 );
76
79
77
80
po::variables_map vm;
78
81
po::store (po::command_line_parser (argc, argv).options (desc).positional (p).run (), vm);
@@ -82,17 +85,17 @@ class CommandLineParams {
82
85
cout << desc << " \n " ;
83
86
return false ;
84
87
}
85
- parqFileName = vm.count (" parq-file " ) ? vm[" parq-file " ].as <vector<string>>().front () : string ();
86
- coldefFileName = vm.count (" coldef-file " ) ? vm[" coldef-file " ].as <vector<string>>().front () : string ();
87
- csvFileName = vm.count (" csv-file " ) ? vm[" csv-file " ].as <vector<string>>().front () : string ();
88
+ parqFileName = vm.count (" parq" ) ? vm[" parq" ].as <vector<string>>().front () : string ();
89
+ configFileName = vm.count (" config " ) ? vm[" config " ].as <vector<string>>().front () : string ();
90
+ csvFileName = vm.count (" csv" ) ? vm[" csv" ].as <vector<string>>().front () : string ();
88
91
89
- if (parqFileName.empty () || coldefFileName .empty () || csvFileName.empty ()) {
92
+ if (parqFileName.empty () || configFileName .empty () || csvFileName.empty ()) {
90
93
throw runtime_error (" The names of all required files must be provided." );
91
94
}
92
95
if (csvFileName == parqFileName) {
93
96
throw runtime_error (" Input and output file names must be different." );
94
97
}
95
- _parseColdDefFile ();
98
+ _parseConfigFile ();
96
99
97
100
if (vm.count (" max-mem-alloc-mb" )) {
98
101
maxMemAllocatedMB = vm[" max-mem-alloc-mb" ].as <int >();
@@ -114,11 +117,11 @@ class CommandLineParams {
114
117
// Values of the parsed parameters are stored in the data members defined below.
115
118
116
119
string parqFileName;
117
- string coldefFileName ;
120
+ string configFileName ;
118
121
string csvFileName;
119
122
120
123
vector<string> columns;
121
- map <string, string> optionalColumnDefs ;
124
+ set <string> optionalColumns ;
122
125
123
126
int maxMemAllocatedMB = 3000 ;
124
127
int maxBuffSizeMB = 16 ;
@@ -129,19 +132,44 @@ class CommandLineParams {
129
132
bool verbose = false ;
130
133
131
134
private:
132
- void _parseColdDefFile () {
133
- columns.clear ();
134
- ifstream columnsFile (coldefFileName);
135
- if (!columnsFile) {
136
- throw runtime_error (" Error while opening the columns file." );
135
+ void _parseConfigFile () {
136
+ ifstream file (configFileName, ios_base::in);
137
+ if (!file.good ()) throw invalid_argument (" Failed to open file: '" + configFileName + " '" );
138
+ json config;
139
+ try {
140
+ file >> config;
141
+ } catch (...) {
142
+ throw runtime_error (" Config file: '" + configFileName + " ' doesn't have a valid JSON payload" );
137
143
}
138
- string column;
139
- while (columnsFile >> column) {
140
- columns.push_back (column);
144
+ if (!config.is_object ()) {
145
+ throw invalid_argument (" Config file: '" + configFileName + " ' is not a valid JSON object" );
141
146
}
147
+ if (!config.contains (" columns" )) {
148
+ throw runtime_error (" The JSON file must contain a 'columns' key." );
149
+ }
150
+ if (!config[" columns" ].is_array ()) {
151
+ throw runtime_error (" The 'columns' key must contain an array." );
152
+ }
153
+ columns = config[" columns" ].get <vector<string>>();
142
154
if (columns.empty ()) {
143
155
throw runtime_error (" No columns to be extracted." );
144
156
}
157
+ optionalColumns.clear ();
158
+ if (config.contains (" optional" )) {
159
+ if (!config[" optional" ].is_array ()) {
160
+ throw runtime_error (" The 'optional' key must contain an object." );
161
+ }
162
+ for (auto const & column : config[" optional" ].get <vector<string>>()) {
163
+ optionalColumns.insert (column);
164
+ }
165
+ }
166
+ // All optional columns must be defined in the 'columns' array.
167
+ for (auto const & name : optionalColumns) {
168
+ if (find (columns.begin (), columns.end (), name) == columns.end ()) {
169
+ throw runtime_error (" The optional column '" + name +
170
+ " ' is not defined in the 'columns' array." );
171
+ }
172
+ }
145
173
}
146
174
};
147
175
@@ -163,23 +191,24 @@ int main(int argc, char const* const* argv) {
163
191
cout << " Translating '" << params.parqFileName << " ' into '" << params.csvFileName << " '" << endl;
164
192
}
165
193
part::ParquetFile parqFile (params.parqFileName , params.maxMemAllocatedMB );
166
- if (parqFile.setupBatchReader (maxBuffSizeBytes) != arrow::Status::OK ()) {
167
- throw runtime_error (" Error while setting up the batch reader." );
168
- }
194
+ parqFile.setupBatchReader (maxBuffSizeBytes);
195
+
169
196
ofstream csvFile (params.csvFileName , ios::out | ios::binary);
170
197
if (!csvFile) {
171
198
throw runtime_error (" Error while opening the output file." );
172
199
}
173
200
while (true ) {
174
- auto status = parqFile.readNextBatch_Table2CSV (buf.get (), buffSize, params.columns ,
175
- params.optionalColumnDefs , params.nullStr ,
176
- params.delimStr );
177
- if ((status != arrow::Status::OK ()) || (buffSize == 0 )) break ;
178
- if (params.verbose ) {
179
- cout << " Writing " << setw (9 ) << buffSize << " bytes" << endl;
201
+ bool const hasMore =
202
+ parqFile.readNextBatch_Table2CSV (buf.get (), buffSize, params.columns ,
203
+ params.optionalColumns , params.nullStr , params.delimStr );
204
+ if (buffSize > 0 ) {
205
+ if (params.verbose ) {
206
+ cout << " Writing " << setw (9 ) << buffSize << " bytes" << endl;
207
+ }
208
+ csvFile.write ((char *)(buf.get ()), buffSize);
209
+ numBytesWritten += buffSize;
180
210
}
181
- csvFile.write ((char *)(buf.get ()), buffSize);
182
- numBytesWritten += buffSize;
211
+ if (!hasMore || (buffSize == 0 )) break ;
183
212
}
184
213
csvFile.close ();
185
214
if (params.verbose ) {
0 commit comments