12
12
#include < myhtml/serialization.h>
13
13
#include < mycss/selectors/serialization.h>
14
14
#include < modest/finder/finder.h>
15
+ #include < fmt/core.h>
15
16
16
17
using namespace std ;
17
18
@@ -27,6 +28,9 @@ Usage: %s [options] <selector> <mode> [mode argument]
27
28
delimiter character to use between results (defaults to newline)
28
29
-0, --null
29
30
uses \0 as delimiter
31
+ -F, --format <selector> <format string>
32
+ specify custom format string for element stringification (can be specified multiple times)
33
+ example: `-F a '->{}<-'` - renders <a> text wrapped in '->' and '<-'
30
34
31
35
<selector>
32
36
CSS selector to match against
@@ -53,11 +57,18 @@ static const string afmt_e = "m";
53
57
static const vector<char > collapsible = {' ' , ' \t ' , ' \n ' , ' \r ' };
54
58
static const vector<myhtml_tag_id_t > breaking = {
55
59
MyHTML_TAG_BR,
56
- MyHTML_TAG_P
60
+ MyHTML_TAG_P,
61
+ MyHTML_TAG_H1,
62
+ MyHTML_TAG_H2,
63
+ MyHTML_TAG_H3,
64
+ MyHTML_TAG_H4,
65
+ MyHTML_TAG_H5,
66
+ MyHTML_TAG_H6,
67
+ MyHTML_TAG_HR,
57
68
};
58
69
59
- static map<const string, bool > flags = {
60
- {" dirtyargs" , false }
70
+ static map<const string, int > flags = {
71
+ {" dirtyargs" , 0 }
61
72
};
62
73
63
74
static map<const string, string> state = { // global state
@@ -67,15 +78,16 @@ static map<const string, string> state = { // global state
67
78
{" selector" , " " }, // matching selector
68
79
{" mode" , " " }, // output mode
69
80
{" data" , " " }, // read input data
70
- {" modearg" , " " } // mode argument (optional)
81
+ {" modearg" , " " }, // mode argument (optional)
82
+ {" scratch" , " " }, // scratchpad value (internal use)
71
83
};
72
84
73
85
bool readarg (int &argc, const char ** &argv, string argname, const bool die_on_err = true ){
74
86
if (argc > 1 ){
75
- state[argname] = argv[1 ];
76
87
argv++;
77
88
argc--;
78
- flags[" dirtyargs" ] = true ;
89
+ state[argname] = *argv;
90
+ flags[" dirtyargs" ]++;
79
91
return true ;
80
92
}else {
81
93
if (die_on_err){
@@ -110,6 +122,10 @@ template <typename ...T> inline bool node_in(myhtml_tree_node_t* node, T... tags
110
122
return false ;
111
123
}
112
124
125
+ bool node_sort (myhtml_tree_node_t * lhs, myhtml_tree_node_t * rhs){
126
+ return myhtml_node_element_position (lhs).begin < myhtml_node_element_position (rhs).begin ;
127
+ }
128
+
113
129
template <typename ...T> inline bool node_before (myhtml_tree_node_t * node, T... tags){
114
130
while ((node = node->next ) && node->tag_id <= 0x003 );
115
131
@@ -134,9 +150,21 @@ static map<const char, const string> option_longopts = { // maps shortopts to lo
134
150
{' h' , " help" },
135
151
{' f' , " file" },
136
152
{' d' , " delimiter" },
137
- {' 0' , " zero" }
153
+ {' 0' , " zero" },
154
+ {' F' , " format" },
138
155
};
139
156
157
+ vector<tuple<string, string, myhtml_collection_t *>> selector_format = {};
158
+
159
+ const char * format_node (myhtml_tree_node_t * node){
160
+ for (auto & [fselect, fstr, fcollect] : selector_format)
161
+ if (fcollect)
162
+ for (myhtml_tree_node_t * select_node : vector<myhtml_tree_node_t *>(fcollect->list , fcollect->list +fcollect->length ))
163
+ if (node == select_node) return fstr.c_str ();
164
+
165
+ return " {}" ;
166
+ }
167
+
140
168
static map<const string, const function<void (int &, const char **&)>> option_handlers = { // maps longopts to functions
141
169
{" help" , [](int &argc, const char ** &argv) {
142
170
fprintf (stderr, helptext, state[" progname" ].c_str (), state[" progname" ].c_str (), state[" progname" ].c_str ());
@@ -152,7 +180,29 @@ static map<const string, const function<void(int&, const char**&)>> option_handl
152
180
}},
153
181
{" zero" , [](int &argc, const char ** &argv) {
154
182
state[" delim" ] = " \0 " ;
155
- }}
183
+ }},
184
+ {" format" , [](int &argc, const char ** &argv) {
185
+ argv++, argc--;
186
+ if (!readarg (argc, argv, " scratch" , false )){
187
+ cerr << " missing selector in --format" << endl;
188
+ exit (EXIT_FAILURE);
189
+ }
190
+ string fselect = state[" scratch" ];
191
+ if (!readarg (argc, argv, " scratch" , false )){
192
+ cerr << " missing format string in --format" << endl;
193
+ exit (EXIT_FAILURE);
194
+ }
195
+ string form = state[" scratch" ];
196
+
197
+ if (fselect.length () == 0 ){
198
+ cerr << " invalid --format " << fselect << " " << form << endl;
199
+ exit (EXIT_FAILURE);
200
+ }
201
+
202
+ selector_format.push_back (tuple<string, string, myhtml_collection_t *>(fselect, form, nullptr ));
203
+
204
+ argv--, argc++;
205
+ }},
156
206
};
157
207
158
208
static pair<const function<void (myhtml_tree_node_t *, string&)>, const function<void (myhtml_tree_node_t *, string&)>> format_handlers = { // {format, unformat}
@@ -169,6 +219,12 @@ static pair<const function<void(myhtml_tree_node_t*, string&)>, const function<v
169
219
case MyHTML_TAG_I: // italics on
170
220
case MyHTML_TAG_U:
171
221
case MyHTML_TAG_EM:
222
+ case MyHTML_TAG_H1:
223
+ case MyHTML_TAG_H2:
224
+ case MyHTML_TAG_H3:
225
+ case MyHTML_TAG_H4:
226
+ case MyHTML_TAG_H5:
227
+ case MyHTML_TAG_H6:
172
228
if (ansi) rendered += afmt_s + " 4" + afmt_e;
173
229
if (md) rendered += " _" ;
174
230
break ;
@@ -201,6 +257,12 @@ static pair<const function<void(myhtml_tree_node_t*, string&)>, const function<v
201
257
case MyHTML_TAG_I: // italics off
202
258
case MyHTML_TAG_U:
203
259
case MyHTML_TAG_EM:
260
+ case MyHTML_TAG_H1:
261
+ case MyHTML_TAG_H2:
262
+ case MyHTML_TAG_H3:
263
+ case MyHTML_TAG_H4:
264
+ case MyHTML_TAG_H5:
265
+ case MyHTML_TAG_H6:
204
266
if (ansi) rendered += afmt_s + " 24" + afmt_e; // no italics here :(
205
267
if (md) rendered += " _" ;
206
268
break ;
@@ -225,6 +287,11 @@ static pair<const function<void(myhtml_tree_node_t*, string&)>, const function<v
225
287
rendered += " \t " ;
226
288
}
227
289
break ;
290
+ case MyHTML_TAG_TR:
291
+ if (rendered.back () != ' \n ' ){
292
+ rendered += " \n " ;
293
+ }
294
+ break ;
228
295
}
229
296
230
297
if (vec_has (breaking, node_iter->tag_id )){ // <br/>
@@ -233,56 +300,57 @@ static pair<const function<void(myhtml_tree_node_t*, string&)>, const function<v
233
300
}
234
301
};
235
302
303
+ string render_node (myhtml_tree_node_t * node_iter){
304
+ string rendered = " " ;
305
+
306
+ if (node_iter->tag_id == MyHTML_TAG_STYLE) return rendered;
307
+
308
+ format_handlers.first (node_iter, rendered);
309
+
310
+ if (node_iter->tag_id == MyHTML_TAG__TEXT){
311
+ string text (myhtml_node_text (node_iter, nullptr ));
312
+ if (!node_in (node_iter, MyHTML_TAG_PRE)){
313
+ // collapse whitespace to single character
314
+ string::iterator nend = unique (text.begin (), text.end (), [](char c1, char c2) -> bool {
315
+ return vec_has (collapsible, c1) && vec_has (collapsible, c2);
316
+ });
317
+ text.resize (static_cast <unsigned long >(nend-text.begin ()));
318
+
319
+ // replace whitespace with space
320
+ replace_if (text.begin (), text.end (), [](char c) -> bool {
321
+ return vec_has (collapsible, c);
322
+ }, ' ' );
323
+ }
324
+
325
+ rendered += text;
326
+ }
327
+
328
+ if (node_iter->child ){
329
+ rendered += render_node (node_iter->child );
330
+ }
331
+
332
+ rendered = fmt::format (format_node (node_iter), rendered);
333
+
334
+ format_handlers.second (node_iter, rendered);
335
+
336
+ if ((node_iter = node_iter->next )){
337
+ rendered += render_node (node_iter);
338
+ }
339
+
340
+ return rendered;
341
+ }
342
+
236
343
static map<const string, const function<void (myhtml_tree_node_t *)>> mode_handlers = { // maps modes to functions
237
344
{"data", [](myhtml_tree_node_t* node) {
    // "data" mode: serialize the matched node's subtree back to markup on
    // stdout, followed by the configured delimiter character.
    myhtml_serialization_tree_callback(node, [](const char* data, size_t len, void* ctx) -> unsigned int {
        // The serializer hands over each chunk with an explicit length, so
        // write exactly `len` bytes instead of treating `data` as a
        // NUL-terminated string.
        fwrite(data, 1, len, stdout);
        return 0;
    }, node);
    printf("%c", state["delim"][0]);
}},
244
351
245
352
{" text" , [](myhtml_tree_node_t * node) {
246
- string rendered = " " ;
247
-
248
- myhtml_tree_node_t * node_iter = node->child ;
249
- while (node_iter){
250
- const char * text_c = myhtml_node_text (node_iter, nullptr );
251
- string text = " " ;
252
- if (text_c != nullptr ) text += text_c;
253
-
254
- if (node_iter->tag_id == MyHTML_TAG__TEXT){
255
- if (!node_in (node_iter, MyHTML_TAG_PRE)){
256
- // collapse whitespace to single character
257
- string::iterator nend = unique (text.begin (), text.end (), [](char c1, char c2) -> bool {
258
- return vec_has (collapsible, c1) && vec_has (collapsible, c2);
259
- });
260
- text.resize (static_cast <unsigned long >(nend-text.begin ()));
261
-
262
- // replace whitespace with space
263
- replace_if (text.begin (), text.end (), [](char c) -> bool {
264
- return vec_has (collapsible, c);
265
- }, ' ' );
266
- }
267
-
268
- rendered += text;
269
- }else {
270
- format_handlers.first (node_iter, rendered);
271
- }
272
-
273
- if (node_iter->child ) node_iter = node_iter->child ;
274
- else {
275
- while (node_iter != node && node_iter->next == nullptr ){
276
- format_handlers.second (node_iter, rendered);
277
-
278
- node_iter = node_iter->parent ;
279
- }
280
- if (node_iter == node) break ;
281
-
282
- format_handlers.second (node_iter, rendered);
283
- node_iter = node_iter->next ;
284
- }
285
- }
353
+ string rendered = render_node (node->child );
286
354
287
355
size_t index = 0 ;
288
356
while ((index = rendered.find (" \n " , index)) != string::npos){ // clear whitespace before multiline content
@@ -296,7 +364,8 @@ static map<const string, const function<void(myhtml_tree_node_t*)>> mode_handler
296
364
while (vec_has (collapsible, rendered[0 ])) rendered.erase (0 , 1 ); // clear whitespace before single-line content
297
365
while (vec_has (collapsible, *(rendered.end ()-1 ))) rendered.erase (rendered.length ()-1 , 1 ); // clear whitespace after single-line content
298
366
299
- cout << rendered;
367
+ fmt::print (format_node (node), rendered);
368
+ // printf(fmt, rendered);
300
369
printf (" %c" , state[" delim" ][0 ]);
301
370
}},
302
371
@@ -314,7 +383,7 @@ static map<const string, const function<void(myhtml_tree_node_t*)>> mode_handler
314
383
315
384
do {
316
385
if (state[" modearg" ] == mycore_string_data (&attr->key )){
317
- cout << mycore_string_data (&attr->value );
386
+ fmt::print ( format_node (node), mycore_string_data (&attr->value ) );
318
387
printf (" %c" , state[" delim" ][0 ]);
319
388
}
320
389
}while (attr != token->attr_last && (attr = attr->next )); // move attr pointer further & loop if attr_last not hit
@@ -343,8 +412,8 @@ void parseopts(int &argc, const char** &argv){
343
412
cerr << " invalid short option '-" << argv[1 ][0 ] << " '" << endl;
344
413
exit (EXIT_FAILURE);
345
414
}
346
- if (flags[" dirtyargs" ]){ // option handler touched argv (args?); skip
347
- flags[" dirtyargs" ] = false ;
415
+ if (flags[" dirtyargs" ] > 0 ){ // option handler touched argv (args?); skip
416
+ flags[" dirtyargs" ]-- ;
348
417
break ;
349
418
}
350
419
}
@@ -406,9 +475,25 @@ int main(int argc, const char* argv[]){
406
475
myhtml_collection_t * collection = nullptr ;
407
476
modest_finder_by_selectors_list (finder, html_tree->node_html , selectors_list, &collection);
408
477
478
+ for (auto & [fselect, fstr, fcollect] : selector_format){
479
+ mycss_selectors_list_t * fselect_parsed = mycss_selectors_parse (
480
+ mycss_entry_selectors (css_entry),
481
+ MyENCODING_UTF_8,
482
+ fselect.c_str (), fselect.length (),
483
+ &mystatus
484
+ );
485
+ if (fselect_parsed == nullptr || (fselect_parsed->flags & MyCSS_SELECTORS_FLAGS_SELECTOR_BAD)){
486
+ cerr << " bad format selector '" << fselect << " '" << endl;
487
+ exit (EXIT_FAILURE);
488
+ }
489
+ modest_finder_by_selectors_list (finder, html_tree->node_html , fselect_parsed, &fcollect);
490
+ }
491
+
409
492
if (collection){
493
+ vector<myhtml_tree_node_t *> nodes (collection->list , collection->list +collection->length );
494
+ sort (nodes.begin (), nodes.end (), node_sort);
410
495
try {
411
- for (myhtml_tree_node_t * node : vector< myhtml_tree_node_t *>(collection-> list , collection-> list +collection-> length ) ){
496
+ for (myhtml_tree_node_t * node : nodes ){
412
497
mode_handlers[state[" mode" ]](node);
413
498
}
414
499
}catch (bad_function_call&){
0 commit comments