From bc4d0765bc136a36351c2cde09fb59635348cbc5 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Mon, 7 Sep 2015 22:47:01 -0400 Subject: [PATCH] CSV read performance: fix #51 --- c/Makefile | 7 +- c/cli/mlrcli.c | 12 +- c/experimental/csv0.c | 140 --------- c/experimental/pfr.c | 25 -- ...{lrec_reader_csvex.c => lrec_reader_csv.c} | 50 +-- c/input/lrec_reader_stdio_csv.c | 288 ------------------ c/input/lrec_readers.c | 4 +- c/input/lrec_readers.h | 9 +- c/input/old_peek_file_reader.c | 89 ------ c/input/old_peek_file_reader.h | 28 -- 10 files changed, 31 insertions(+), 621 deletions(-) delete mode 100644 c/experimental/csv0.c delete mode 100644 c/experimental/pfr.c rename c/input/{lrec_reader_csvex.c => lrec_reader_csv.c} (88%) delete mode 100644 c/input/lrec_reader_stdio_csv.c delete mode 100644 c/input/old_peek_file_reader.c delete mode 100644 c/input/old_peek_file_reader.h diff --git a/c/Makefile b/c/Makefile index 558156fd84..65ba574165 100644 --- a/c/Makefile +++ b/c/Makefile @@ -51,9 +51,7 @@ TEST_LREC_SRCS = lib/mlrutil.c lib/mlr_globals.c lib/string_builder.c \ containers/lrec.c containers/header_keeper.c containers/sllv.c \ containers/slls.c containers/lhmslv.c \ input/file_reader_mmap.c input/file_reader_stdio.c \ -input/lrec_reader_stdio_csv.c \ input/lrec_reader_mmap_csvlite.c input/lrec_reader_stdio_csvlite.c \ -input/old_peek_file_reader.c \ input/lrec_reader_mmap_dkvp.c input/lrec_reader_stdio_dkvp.c \ input/lrec_reader_mmap_nidx.c input/lrec_reader_stdio_nidx.c \ input/lrec_reader_mmap_xtab.c input/lrec_reader_stdio_xtab.c \ @@ -70,14 +68,12 @@ containers/join_bucket_keeper.c \ input/mmap_byte_reader.c \ input/stdio_byte_reader.c \ input/lrec_reader_in_memory.c input/lrec_readers.c \ -input/lrec_reader_stdio_csv.c \ -input/lrec_reader_csvex.c \ +input/lrec_reader_csv.c \ input/lrec_reader_mmap_csvlite.c input/lrec_reader_stdio_csvlite.c \ input/lrec_reader_mmap_dkvp.c input/lrec_reader_stdio_dkvp.c \ input/lrec_reader_mmap_nidx.c input/lrec_reader_stdio_nidx.c \ input/lrec_reader_mmap_xtab.c input/lrec_reader_stdio_xtab.c \ input/file_reader_mmap.c input/file_reader_stdio.c \ -input/old_peek_file_reader.c \ input/peek_file_reader.c \ containers/test_join_bucket_keeper.c @@ -86,7 +82,6 @@ lib/mlrutil.c \ lib/mlr_globals.c \ lib/string_builder.c \ input/file_reader_mmap.c \ -input/old_peek_file_reader.c \ experimental/getlines.c # ================================================================ diff --git a/c/cli/mlrcli.c b/c/cli/mlrcli.c index e23e6dcc5f..f9df7c4aab 100644 --- a/c/cli/mlrcli.c +++ b/c/cli/mlrcli.c @@ -288,24 +288,14 @@ cli_opts_t* parse_command_line(int argc, char** argv) { argi++; } -// ready for cutover! :) -// just leaving it off for a while to compare performance. -#if 0 - else if (streq(argv[argi], "--csv")) { popts->ifmt = "csvex"; ofmt = "csv"; } - else if (streq(argv[argi], "--icsv")) { popts->ifmt = "csvex"; } - else if (streq(argv[argi], "--ocsv")) { ofmt = "csv"; } -#else else if (streq(argv[argi], "--csv")) { popts->ifmt = ofmt = "csv"; } else if (streq(argv[argi], "--icsv")) { popts->ifmt = "csv"; } else if (streq(argv[argi], "--ocsv")) { ofmt = "csv"; } -#endif + else if (streq(argv[argi], "--csvlite")) { popts->ifmt = ofmt = "csvlite"; } else if (streq(argv[argi], "--icsvlite")) { popts->ifmt = "csvlite"; } else if (streq(argv[argi], "--ocsvlite")) { ofmt = "csvlite"; } - else if (streq(argv[argi], "--csvex")) { popts->ifmt = "csvex"; ofmt = "csv"; } - else if (streq(argv[argi], "--icsvex")) { popts->ifmt = "csvex"; } - else if (streq(argv[argi], "--dkvp")) { popts->ifmt = ofmt = "dkvp"; } else if (streq(argv[argi], "--idkvp")) { popts->ifmt = "dkvp"; } else if (streq(argv[argi], "--odkvp")) { ofmt = "dkvp"; } diff --git a/c/experimental/csv0.c b/c/experimental/csv0.c deleted file mode 100644 index f656a0a3ef..0000000000 --- a/c/experimental/csv0.c +++ /dev/null @@ -1,140 +0,0 @@ -#include -#include -#include -#include "lib/mlrutil.h" -#include "containers/slls.h" -#include "lib/string_builder.h" -#include "input/old_peek_file_reader.h" - -#define TERMIND_RS 0x1111 -#define TERMIND_FS 0x2222 -#define TERMIND_EOF 0x3333 - -typedef struct _field_wrapper_t { - char* contents; - int termind; -} field_wrapper_t; - -typedef struct _record_wrapper_t { - slls_t* contents; - int at_eof; -} record_wrapper_t; - -static field_wrapper_t get_csv_field_not_dquoted(old_peek_file_reader_t* pfr, string_builder_t* psb) { - // Note that "\"," etc. will be encoded in the rfc_csv_reader_t ctor -- this is just sketch - printf("\n"); - printf("ENTER\n"); - while (TRUE) { - if (old_pfr_at_eof(pfr)) { - printf("--case 1\n"); - printf("EXIT\n"); - return (field_wrapper_t) { .contents = sb_is_empty(psb) ? NULL: sb_finish(psb), .termind = TERMIND_EOF }; - } else if (old_pfr_next_is(pfr, ",\xff", 2)) { - printf("--case 2\n"); - old_pfr_advance_by(pfr, 2); - printf("EXIT\n"); - return (field_wrapper_t) { .contents = sb_finish(psb), .termind = TERMIND_EOF }; - } else if (old_pfr_next_is(pfr, ",", 1)) { - printf("--case 3\n"); - old_pfr_advance_by(pfr, 1); - printf("EXIT\n"); - return (field_wrapper_t) { .contents = sb_finish(psb), .termind = TERMIND_FS }; - } else if (old_pfr_next_is(pfr, "\r\n", 2)) { - printf("--case 4\n"); - old_pfr_advance_by(pfr, 2); - printf("EXIT\n"); - return (field_wrapper_t) { .contents = sb_finish(psb), .termind = TERMIND_RS }; - } else { - //old_pfr_dump(pfr); - char c = old_pfr_read_char(pfr); - printf("--case 5 %c [%02x]\n", isprint(c) ? c : '?', c); - //old_pfr_dump(pfr); - sb_append_char(psb, c); - //sb_append_char(psb, old_pfr_read_char(pfr)); - } - } -} - -static field_wrapper_t get_csv_field_dquoted(old_peek_file_reader_t* pfr, string_builder_t* psb) { - old_pfr_advance_by(pfr, 1); - while (TRUE) { - if (old_pfr_at_eof(pfr)) { - // xxx imbalanced-dquote error - fprintf(stderr, "xxx k0d3 me up b04k3n b04k3n b04ken %d\n", __LINE__); - exit(1); - } else if (old_pfr_next_is(pfr, "\"\xff", 2)) { - old_pfr_advance_by(pfr, 2); - return (field_wrapper_t) { .contents = sb_finish(psb), .termind = TERMIND_EOF }; - } else if (old_pfr_next_is(pfr, "\",", 2)) { - old_pfr_advance_by(pfr, 2); - return (field_wrapper_t) { .contents = sb_finish(psb), .termind = TERMIND_FS }; - } else if (old_pfr_next_is(pfr, "\"\r\n", 3)) { - old_pfr_advance_by(pfr, 3); - return (field_wrapper_t) { .contents = sb_finish(psb), .termind = TERMIND_RS }; - } else { - sb_append_char(psb, old_pfr_read_char(pfr)); - } - } -} - -field_wrapper_t get_csv_field(old_peek_file_reader_t* pfr, string_builder_t* psb) { - field_wrapper_t wrapper; - if (old_pfr_at_eof(pfr)) { - wrapper.contents = NULL; - wrapper.termind = TERMIND_EOF; - return wrapper; - } else if (old_pfr_next_is(pfr, "\"", 1)) { - return get_csv_field_dquoted(pfr, psb); - } else { - return get_csv_field_not_dquoted(pfr, psb); - } -} - -record_wrapper_t get_csv_record(old_peek_file_reader_t* pfr, string_builder_t* psb) { - slls_t* fields = slls_alloc(); - record_wrapper_t rwrapper; - rwrapper.contents = fields; - rwrapper.at_eof = FALSE; - while (TRUE) { - field_wrapper_t fwrapper = get_csv_field(pfr, psb); - if (fwrapper.termind == TERMIND_EOF) { - rwrapper.at_eof = TRUE; - } - if (fwrapper.contents != NULL) { - printf("CONT=>>%s<<[%d]\n", fwrapper.contents, (int)strlen(fwrapper.contents)); - slls_add_with_free(fields, fwrapper.contents); - } - if (fwrapper.termind != TERMIND_FS) - break; - } - printf("FLEN=%d\n", fields->length); - printf("FEOF=%d\n", rwrapper.at_eof); - if (fields->length == 0 && rwrapper.at_eof) { - slls_free(fields); - rwrapper.contents = NULL; - } - return rwrapper; -} - -int main() { - FILE* fp = stdin; - old_peek_file_reader_t* pfr = old_pfr_alloc(fp, 32); - string_builder_t sb; - string_builder_t* psb = &sb; - sb_init(psb, 1024); - - while (TRUE) { - record_wrapper_t rwrapper = get_csv_record(pfr, psb); - if (rwrapper.contents != NULL) { - printf("++++ [NF=%d]\n", rwrapper.contents->length); - for (sllse_t* pe = rwrapper.contents->phead; pe != NULL; pe = pe->pnext) { - printf(" [%s]\n", pe->value); - } - slls_free(rwrapper.contents); - } - if (rwrapper.at_eof) - break; - } - - return 0; -} diff --git a/c/experimental/pfr.c b/c/experimental/pfr.c deleted file mode 100644 index 67453648ab..0000000000 --- a/c/experimental/pfr.c +++ /dev/null @@ -1,25 +0,0 @@ -#include -#include -#include "input/old_peek_file_reader.h" - -int main(int argc, char** argv) { - FILE* fp = stdin; - old_peek_file_reader_t* pfr = old_pfr_alloc(fp, 32); - - printf("@eof = %d\n", old_pfr_at_eof(pfr)); - old_pfr_dump(pfr); - printf("read 0x%02x\n", (unsigned)old_pfr_read_char(pfr)); - old_pfr_dump(pfr); - char* s = "//"; - old_pfr_dump(pfr); - printf("next is %s = %d\n", s, old_pfr_next_is(pfr, s, strlen(s))); - old_pfr_dump(pfr); - char c = old_pfr_read_char(pfr); - printf("read %c [0x%02x]\n", c, (unsigned)c); - old_pfr_dump(pfr); - - printf("@eof = %d\n", old_pfr_at_eof(pfr)); - - old_pfr_free(pfr); - return 0; -} diff --git a/c/input/lrec_reader_csvex.c b/c/input/lrec_reader_csv.c similarity index 88% rename from c/input/lrec_reader_csvex.c rename to c/input/lrec_reader_csv.c index dac9738dff..de4a9f4342 100644 --- a/c/input/lrec_reader_csvex.c +++ b/c/input/lrec_reader_csv.c @@ -37,7 +37,7 @@ #define DQUOTE_DQUOTE_STRIDX 0x2008 // ---------------------------------------------------------------- -typedef struct _lrec_reader_csvex_state_t { +typedef struct _lrec_reader_csv_state_t { // Input line number is not the same as the record-counter in context_t, // which counts records. long long ilno; @@ -76,19 +76,19 @@ typedef struct _lrec_reader_csvex_state_t { header_keeper_t* pheader_keeper; lhmslv_t* pheader_keepers; -} lrec_reader_csvex_state_t; +} lrec_reader_csv_state_t; -static slls_t* lrec_reader_csvex_get_fields(lrec_reader_csvex_state_t* pstate); -static lrec_t* paste_header_and_data(lrec_reader_csvex_state_t* pstate, slls_t* pdata_fields); +static slls_t* lrec_reader_csv_get_fields(lrec_reader_csv_state_t* pstate); +static lrec_t* paste_header_and_data(lrec_reader_csv_state_t* pstate, slls_t* pdata_fields); // ---------------------------------------------------------------- // xxx needs abend on null lhs. etc. -static lrec_t* lrec_reader_csvex_process(void* pvstate, void* pvhandle, context_t* pctx) { - lrec_reader_csvex_state_t* pstate = pvstate; +static lrec_t* lrec_reader_csv_process(void* pvstate, void* pvhandle, context_t* pctx) { + lrec_reader_csv_state_t* pstate = pvstate; if (pstate->expect_header_line_next) { - slls_t* pheader_fields = lrec_reader_csvex_get_fields(pstate); + slls_t* pheader_fields = lrec_reader_csv_get_fields(pstate); if (pheader_fields == NULL) return NULL; pstate->ilno++; @@ -104,14 +104,14 @@ static lrec_t* lrec_reader_csvex_process(void* pvstate, void* pvhandle, context_ } pstate->ilno++; - slls_t* pdata_fields = lrec_reader_csvex_get_fields(pstate); + slls_t* pdata_fields = lrec_reader_csv_get_fields(pstate); if (pdata_fields == NULL) // EOF return NULL; else return paste_header_and_data(pstate, pdata_fields); } -static slls_t* lrec_reader_csvex_get_fields(lrec_reader_csvex_state_t* pstate) { +static slls_t* lrec_reader_csv_get_fields(lrec_reader_csv_state_t* pstate) { int rc, stridx, matchlen, record_done, field_done; peek_file_reader_t* pfr = pstate->pfr; string_builder_t* psb = pstate->psb; @@ -239,7 +239,7 @@ static slls_t* lrec_reader_csvex_get_fields(lrec_reader_csvex_state_t* pstate) { } // ---------------------------------------------------------------- -static lrec_t* paste_header_and_data(lrec_reader_csvex_state_t* pstate, slls_t* pdata_fields) { +static lrec_t* paste_header_and_data(lrec_reader_csv_state_t* pstate, slls_t* pdata_fields) { if (pstate->pheader_keeper->pkeys->length != pdata_fields->length) { fprintf(stderr, "%s: Header/data length mismatch: %d != %d at line %lld.\n", MLR_GLOBALS.argv0, pstate->pheader_keeper->pkeys->length, pdata_fields->length, pstate->ilno); @@ -255,29 +255,29 @@ static lrec_t* paste_header_and_data(lrec_reader_csvex_state_t* pstate, slls_t* } // ---------------------------------------------------------------- -void* lrec_reader_csvex_open(void* pvstate, char* filename) { - lrec_reader_csvex_state_t* pstate = pvstate; +void* lrec_reader_csv_open(void* pvstate, char* filename) { + lrec_reader_csv_state_t* pstate = pvstate; pstate->pfr->pbr->popen_func(pstate->pfr->pbr, filename); pfr_reset(pstate->pfr); return NULL; // xxx modify the API after the functional refactor is complete } -void lrec_reader_csvex_close(void* pvstate, void* pvhandle) { - lrec_reader_csvex_state_t* pstate = pvstate; +void lrec_reader_csv_close(void* pvstate, void* pvhandle) { + lrec_reader_csv_state_t* pstate = pvstate; pstate->pfr->pbr->pclose_func(pstate->pfr->pbr); } // ---------------------------------------------------------------- // xxx after the pfr/pbr refactor is complete, vsof and vopen may be redundant. -static void lrec_reader_csvex_sof(void* pvstate) { - lrec_reader_csvex_state_t* pstate = pvstate; +static void lrec_reader_csv_sof(void* pvstate) { + lrec_reader_csv_state_t* pstate = pvstate; pstate->ilno = 0LL; pstate->expect_header_line_next = TRUE; } // ---------------------------------------------------------------- -static void lrec_reader_csvex_free(void* pvstate) { - lrec_reader_csvex_state_t* pstate = pvstate; +static void lrec_reader_csv_free(void* pvstate) { + lrec_reader_csv_state_t* pstate = pvstate; for (lhmslve_t* pe = pstate->pheader_keepers->phead; pe != NULL; pe = pe->pnext) { header_keeper_t* pheader_keeper = pe->pvvalue; header_keeper_free(pheader_keeper); @@ -286,10 +286,10 @@ static void lrec_reader_csvex_free(void* pvstate) { } // ---------------------------------------------------------------- -lrec_reader_t* lrec_reader_csvex_alloc(byte_reader_t* pbr, char irs, char ifs) { +lrec_reader_t* lrec_reader_csv_alloc(byte_reader_t* pbr, char irs, char ifs) { lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t)); - lrec_reader_csvex_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_csvex_state_t)); + lrec_reader_csv_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_csv_state_t)); pstate->ilno = 0LL; pstate->eof = "\xff"; @@ -349,11 +349,11 @@ lrec_reader_t* lrec_reader_csvex_alloc(byte_reader_t* pbr, char irs, char ifs) { pstate->pheader_keepers = lhmslv_alloc(); plrec_reader->pvstate = (void*)pstate; - plrec_reader->popen_func = &lrec_reader_csvex_open; - plrec_reader->pclose_func = &lrec_reader_csvex_close; - plrec_reader->pprocess_func = &lrec_reader_csvex_process; - plrec_reader->psof_func = &lrec_reader_csvex_sof; - plrec_reader->pfree_func = &lrec_reader_csvex_free; + plrec_reader->popen_func = &lrec_reader_csv_open; + plrec_reader->pclose_func = &lrec_reader_csv_close; + plrec_reader->pprocess_func = &lrec_reader_csv_process; + plrec_reader->psof_func = &lrec_reader_csv_sof; + plrec_reader->pfree_func = &lrec_reader_csv_free; return plrec_reader; } diff --git a/c/input/lrec_reader_stdio_csv.c b/c/input/lrec_reader_stdio_csv.c deleted file mode 100644 index 456b3fc0b8..0000000000 --- a/c/input/lrec_reader_stdio_csv.c +++ /dev/null @@ -1,288 +0,0 @@ -#include -#include -#include "lib/mlr_globals.h" -#include "lib/mlrutil.h" -#include "containers/slls.h" -#include "containers/lhmslv.h" -#include "input/file_reader_stdio.h" -#include "input/lrec_readers.h" -#include "lib/string_builder.h" -#include "input/old_peek_file_reader.h" - -// Idea of pheader_keepers: each header_keeper object retains the input-line backing -// and the slls_t for a CSV header line which is used by one or more CSV data -// lines. Meanwhile some mappers retain input records from the entire data -// stream, including header-schema changes in the input stream. This means we -// need to keep headers intact as long as any lrecs are pointing to them. One -// option is reference-counting which I experimented with; it was messy and -// error-prone. The approach used here is to keep a hash map from header-schema -// to header_keeper object. The current pheader_keeper is a pointer into one of -// those. Then when the reader is freed, all the header-keepers are freed. - -// ---------------------------------------------------------------- -#define STRING_BUILDER_INIT_SIZE 1024 -#define TERMIND_RS 0x1111 -#define TERMIND_FS 0x2222 -#define TERMIND_EOF 0x3333 - -typedef struct _field_wrapper_t { - char* contents; - int termind; -} field_wrapper_t; - -typedef struct _record_wrapper_t { - slls_t* contents; - int at_eof; -} record_wrapper_t; - -// ---------------------------------------------------------------- -typedef struct _lrec_reader_stdio_csv_state_t { - // Input line number is not the same as the record-counter in context_t, - // which counts records. - long long ilno; - - char* irs; - char* ifs; - char* dquote_irs; - char* dquote_ifs; - char* dquote_eof; - char* dquote; - char* dquote_dquote; - char* ifs_eof; - - int irs_len; - int ifs_len; - int dquote_irs_len; - int dquote_ifs_len; - int dquote_eof_len; - int dquote_len; - int dquote_dquote_len; - int ifs_eof_len; - - int peek_buf_len; - - string_builder_t sb; - string_builder_t* psb; - old_peek_file_reader_t* pfr; - - int expect_header_line_next; - header_keeper_t* pheader_keeper; - lhmslv_t* pheader_keepers; - -} lrec_reader_stdio_csv_state_t; - -// ---------------------------------------------------------------- -static record_wrapper_t lrec_reader_stdio_csv_get_record(lrec_reader_stdio_csv_state_t* pstate); - -static field_wrapper_t get_csv_field(lrec_reader_stdio_csv_state_t* pstate); -static field_wrapper_t get_csv_field_not_dquoted(lrec_reader_stdio_csv_state_t* pstate); -static field_wrapper_t get_csv_field_dquoted(lrec_reader_stdio_csv_state_t* pstate); -static lrec_t* paste_header_and_data(lrec_reader_stdio_csv_state_t* pstate, slls_t* pdata_fields); - -// ---------------------------------------------------------------- -// xxx needs abend on null lhs. etc. - -static lrec_t* lrec_reader_stdio_csv_process(void* pvstate, void* pvhandle, context_t* pctx) { - lrec_reader_stdio_csv_state_t* pstate = pvstate; - if (pstate->pfr == NULL) { - pstate->pfr = old_pfr_alloc((FILE*)pvhandle, pstate->peek_buf_len); - } - - record_wrapper_t rwrapper; - - if (pstate->expect_header_line_next) { - rwrapper = lrec_reader_stdio_csv_get_record(pstate); - - if (rwrapper.contents == NULL && rwrapper.at_eof) - return NULL; - pstate->ilno++; - - pstate->expect_header_line_next = FALSE; - - pstate->pheader_keeper = lhmslv_get(pstate->pheader_keepers, rwrapper.contents); - if (pstate->pheader_keeper == NULL) { - pstate->pheader_keeper = header_keeper_alloc(NULL, rwrapper.contents); - lhmslv_put(pstate->pheader_keepers, rwrapper.contents, pstate->pheader_keeper); - } else { // Re-use the header-keeper in the header cache - slls_free(rwrapper.contents); - } - } - - rwrapper = lrec_reader_stdio_csv_get_record(pstate); - if (rwrapper.contents == NULL && rwrapper.at_eof) - return NULL; - - pstate->ilno++; - return paste_header_and_data(pstate, rwrapper.contents); -} - -static record_wrapper_t lrec_reader_stdio_csv_get_record(lrec_reader_stdio_csv_state_t* pstate) { - slls_t* pfields = slls_alloc(); - record_wrapper_t rwrapper; - rwrapper.contents = pfields; - rwrapper.at_eof = FALSE; - while (TRUE) { - field_wrapper_t fwrapper = get_csv_field(pstate); - if (fwrapper.termind == TERMIND_EOF) - rwrapper.at_eof = TRUE; - if (fwrapper.contents != NULL) - slls_add_with_free(pfields, fwrapper.contents); - if (fwrapper.termind != TERMIND_FS) - break; - } - if (pfields->length == 0 && rwrapper.at_eof) { - slls_free(pfields); - rwrapper.contents = NULL; - } - return rwrapper; -} - -static field_wrapper_t get_csv_field(lrec_reader_stdio_csv_state_t* pstate) { - field_wrapper_t wrapper; - if (old_pfr_at_eof(pstate->pfr)) { - wrapper.contents = NULL; - wrapper.termind = TERMIND_EOF; - return wrapper; - } else if (old_pfr_next_is(pstate->pfr, pstate->dquote, pstate->dquote_len)) { - old_pfr_advance_by(pstate->pfr, pstate->dquote_len); - return get_csv_field_dquoted(pstate); - } else { - return get_csv_field_not_dquoted(pstate); - } -} - -static field_wrapper_t get_csv_field_not_dquoted(lrec_reader_stdio_csv_state_t* pstate) { - while (TRUE) { - if (old_pfr_at_eof(pstate->pfr)) { - return (field_wrapper_t) { - .contents = sb_is_empty(pstate->psb) ? NULL: sb_finish(pstate->psb), - .termind = TERMIND_EOF - }; - } else if (old_pfr_next_is(pstate->pfr, pstate->ifs_eof, pstate->ifs_eof_len)) { - old_pfr_advance_by(pstate->pfr, pstate->ifs_eof_len); - return (field_wrapper_t) { .contents = sb_finish(pstate->psb), .termind = TERMIND_EOF }; - } else if (old_pfr_next_is(pstate->pfr, pstate->ifs, pstate->ifs_len)) { - old_pfr_advance_by(pstate->pfr, pstate->ifs_len); - return (field_wrapper_t) { .contents = sb_finish(pstate->psb), .termind = TERMIND_FS }; - } else if (old_pfr_next_is(pstate->pfr, pstate->irs, pstate->irs_len)) { - old_pfr_advance_by(pstate->pfr, pstate->irs_len); - return (field_wrapper_t) { .contents = sb_finish(pstate->psb), .termind = TERMIND_RS }; - } else if (old_pfr_next_is(pstate->pfr, pstate->dquote, pstate->dquote_len)) { - fprintf(stderr, "%s: non-compliant field-internal double-quote at line %lld.\n", - MLR_GLOBALS.argv0, pstate->ilno); - exit(1); - } else { - sb_append_char(pstate->psb, old_pfr_read_char(pstate->pfr)); - } - } -} - -static field_wrapper_t get_csv_field_dquoted(lrec_reader_stdio_csv_state_t* pstate) { - while (TRUE) { - if (old_pfr_at_eof(pstate->pfr)) { - fprintf(stderr, "%s: imbalanced double-quote at line %lld.\n", MLR_GLOBALS.argv0, pstate->ilno); - exit(1); - } else if (old_pfr_next_is(pstate->pfr, pstate->dquote_eof, pstate->dquote_eof_len)) { - old_pfr_advance_by(pstate->pfr, pstate->dquote_eof_len); - return (field_wrapper_t) { .contents = sb_finish(pstate->psb), .termind = TERMIND_EOF }; - } else if (old_pfr_next_is(pstate->pfr, pstate->dquote_ifs, pstate->dquote_ifs_len)) { - old_pfr_advance_by(pstate->pfr, pstate->dquote_ifs_len); - return (field_wrapper_t) { .contents = sb_finish(pstate->psb), .termind = TERMIND_FS }; - } else if (old_pfr_next_is(pstate->pfr, pstate->dquote_irs, pstate->dquote_irs_len)) { - old_pfr_advance_by(pstate->pfr, pstate->dquote_irs_len); - return (field_wrapper_t) { .contents = sb_finish(pstate->psb), .termind = TERMIND_RS }; - } else if (old_pfr_next_is(pstate->pfr, pstate->dquote_dquote, pstate->dquote_dquote_len)) { - // "" inside a dquoted field is an escape for " - old_pfr_advance_by(pstate->pfr, pstate->dquote_dquote_len); - sb_append_string(pstate->psb, pstate->dquote); - } else { - sb_append_char(pstate->psb, old_pfr_read_char(pstate->pfr)); - } - } -} - -static lrec_t* paste_header_and_data(lrec_reader_stdio_csv_state_t* pstate, slls_t* pdata_fields) { - if (pstate->pheader_keeper->pkeys->length != pdata_fields->length) { - fprintf(stderr, "%s: Header/data length mismatch: %d != %d at line %lld.\n", - MLR_GLOBALS.argv0, pstate->pheader_keeper->pkeys->length, pdata_fields->length, pstate->ilno); - exit(1); - } - lrec_t* prec = lrec_unbacked_alloc(); - sllse_t* ph = pstate->pheader_keeper->pkeys->phead; - sllse_t* pd = pdata_fields->phead; - for ( ; ph != NULL && pd != NULL; ph = ph->pnext, pd = pd->pnext) { - lrec_put_no_free(prec, ph->value, pd->value); - } - return prec; -} - -// ---------------------------------------------------------------- -static void lrec_reader_stdio_csv_sof(void* pvstate) { - lrec_reader_stdio_csv_state_t* pstate = pvstate; - pstate->ilno = 0LL; - pstate->expect_header_line_next = TRUE; - pstate->pfr = NULL; -} - -// ---------------------------------------------------------------- -static void lrec_reader_stdio_csv_free(void* pvstate) { - lrec_reader_stdio_csv_state_t* pstate = pvstate; - for (lhmslve_t* pe = pstate->pheader_keepers->phead; pe != NULL; pe = pe->pnext) { - header_keeper_t* pheader_keeper = pe->pvvalue; - header_keeper_free(pheader_keeper); - } - old_pfr_free(pstate->pfr); -} - -// ---------------------------------------------------------------- -lrec_reader_t* lrec_reader_stdio_csv_alloc(char irs, char ifs) { - lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t)); - - lrec_reader_stdio_csv_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_stdio_csv_state_t)); - pstate->ilno = 0LL; - pstate->irs = "\r\n"; // xxx multi-byte the cli irs/ifs/etc, and integrate here - pstate->ifs = ","; // xxx multi-byte the cli irs/ifs/etc, and integrate here - - pstate->dquote_irs = mlr_paste_2_strings("\"", pstate->irs); - pstate->dquote_ifs = mlr_paste_2_strings("\"", pstate->ifs); - pstate->dquote_eof = "\"\xff"; - pstate->dquote = "\""; - pstate->dquote_dquote = "\"\""; - pstate->ifs_eof = mlr_paste_2_strings(pstate->ifs, "\xff"); - - pstate->irs_len = strlen(pstate->irs); - pstate->ifs_len = strlen(pstate->ifs); - pstate->dquote_irs_len = strlen(pstate->dquote_irs); - pstate->dquote_ifs_len = strlen(pstate->dquote_ifs); - pstate->dquote_eof_len = strlen(pstate->dquote_eof); - pstate->dquote_len = strlen(pstate->dquote); - pstate->dquote_dquote_len = strlen(pstate->dquote_dquote); - pstate->ifs_eof_len = strlen(pstate->ifs_eof); - - pstate->peek_buf_len = pstate->irs_len; - pstate->peek_buf_len = mlr_imax2(pstate->peek_buf_len, pstate->ifs_len); - pstate->peek_buf_len = mlr_imax2(pstate->peek_buf_len, pstate->dquote_irs_len); - pstate->peek_buf_len = mlr_imax2(pstate->peek_buf_len, pstate->dquote_ifs_len); - pstate->peek_buf_len = mlr_imax2(pstate->peek_buf_len, pstate->dquote_eof_len); - pstate->peek_buf_len = mlr_imax2(pstate->peek_buf_len, pstate->dquote_len); - pstate->peek_buf_len = mlr_imax2(pstate->peek_buf_len, pstate->dquote_dquote_len); - pstate->peek_buf_len = mlr_imax2(pstate->peek_buf_len, pstate->ifs_eof_len); - pstate->peek_buf_len += 2; - - sb_init(&pstate->sb, STRING_BUILDER_INIT_SIZE); - pstate->psb = &pstate->sb; - pstate->pfr = NULL; - - pstate->expect_header_line_next = TRUE; - pstate->pheader_keeper = NULL; - pstate->pheader_keepers = lhmslv_alloc(); - - plrec_reader->pvstate = (void*)pstate; - plrec_reader->popen_func = &file_reader_stdio_vopen; - plrec_reader->pclose_func = &file_reader_stdio_vclose; - plrec_reader->pprocess_func = &lrec_reader_stdio_csv_process; - plrec_reader->psof_func = &lrec_reader_stdio_csv_sof; - plrec_reader->pfree_func = &lrec_reader_stdio_csv_free; - - return plrec_reader; -} diff --git a/c/input/lrec_readers.c b/c/input/lrec_readers.c index 3efffedb15..5d72f47f5b 100644 --- a/c/input/lrec_readers.c +++ b/c/input/lrec_readers.c @@ -14,14 +14,12 @@ lrec_reader_t* lrec_reader_alloc(char* fmtdesc, int use_mmap, char irs, char if else return lrec_reader_stdio_dkvp_alloc(irs, ifs, ips, allow_repeat_ifs); } else if (streq(fmtdesc, "csv")) { - return lrec_reader_stdio_csv_alloc(irs, ifs); + return lrec_reader_csv_alloc(pbr, irs, ifs); } else if (streq(fmtdesc, "csvlite")) { if (use_mmap) return lrec_reader_mmap_csvlite_alloc(irs, ifs, allow_repeat_ifs); else return lrec_reader_stdio_csvlite_alloc(irs, ifs, allow_repeat_ifs); - } else if (streq(fmtdesc, "csvex")) { - return lrec_reader_csvex_alloc(pbr, irs, ifs); } else if (streq(fmtdesc, "nidx")) { if (use_mmap) return lrec_reader_mmap_nidx_alloc(irs, ifs, allow_repeat_ifs); diff --git a/c/input/lrec_readers.h b/c/input/lrec_readers.h index ea04e63183..c793492aa3 100644 --- a/c/input/lrec_readers.h +++ b/c/input/lrec_readers.h @@ -10,15 +10,14 @@ lrec_reader_t* lrec_reader_alloc(char* fmtdesc, int use_mmap, char irs, char ifs, int allow_repeat_ifs, char ips, int allow_repeat_ips); -lrec_reader_t* lrec_reader_stdio_csv_alloc(char irs, char ifs); lrec_reader_t* lrec_reader_stdio_csvlite_alloc(char irs, char ifs, int allow_repeat_ifs); -lrec_reader_t* lrec_reader_csvex_alloc(byte_reader_t* pbr, char irs, char ifs); +lrec_reader_t* lrec_reader_csv_alloc(byte_reader_t* pbr, char irs, char ifs); lrec_reader_t* lrec_reader_stdio_dkvp_alloc(char irs, char ifs, char ips, int allow_repeat_ifs); lrec_reader_t* lrec_reader_stdio_nidx_alloc(char irs, char ifs, int allow_repeat_ifs); lrec_reader_t* lrec_reader_stdio_xtab_alloc(char ips, int allow_repeat_ips); -lrec_reader_t* lrec_reader_mmap_csv_alloc(char irs, char ifs, int allow_repeat_ifs); -lrec_reader_t* lrec_reader_mmap_csvlite_alloc(char irs, char ifs, int allow_repeat_ifs); +lrec_reader_t* lrec_reader_mmap_csv_alloc(char irs, char ifs, int allow_repeat_ifs); +lrec_reader_t* lrec_reader_mmap_csvlite_alloc(char irs, char ifs, int allow_repeat_ifs); lrec_reader_t* lrec_reader_mmap_dkvp_alloc(char irs, char ifs, char ips, int allow_repeat_ifs); lrec_reader_t* lrec_reader_mmap_nidx_alloc(char irs, char ifs, int allow_repeat_ifs); lrec_reader_t* lrec_reader_mmap_xtab_alloc(char irs, char ips, int allow_repeat_ips); @@ -33,8 +32,6 @@ slls_t* split_csv_header_line(char* line, char ifs, int allow_repeat_ifs); slls_t* split_csvlite_header_line(char* line, char ifs, int allow_repeat_ifs); lrec_t* lrec_parse_stdio_csvlite_data_line(header_keeper_t* pheader_keeper, char* data_line, char ifs, int allow_repeat_ifs); -lrec_t* lrec_parse_stdio_csv_data_line(header_keeper_t* pheader_keeper, char* data_line, char ifs, - int allow_repeat_ifs); lrec_t* lrec_parse_stdio_xtab(slls_t* pxtab_lines, char ips, int allow_repeat_ips); lrec_t* lrec_parse_mmap_nidx(file_reader_mmap_state_t* phandle, char irs, char ifs, int allow_repeat_ifs); diff --git a/c/input/old_peek_file_reader.c b/c/input/old_peek_file_reader.c deleted file mode 100644 index f8c6371a7a..0000000000 --- a/c/input/old_peek_file_reader.c +++ /dev/null @@ -1,89 +0,0 @@ -#include -#include -#include "lib/mlrutil.h" -#include "lib/mlr_globals.h" -#include "old_peek_file_reader.h" - -// xxx comment about efficiency here: enough to deliver rfc-csv feature with performance tuning still tbd. - -// tripartite ascii art w/ chars-to-caller; the peekbuf; pending data in the fp. -// label in particular eof handling. - -// ---------------------------------------------------------------- -old_peek_file_reader_t* old_pfr_alloc(FILE* fp, int maxnpeek) { - old_peek_file_reader_t* pfr = mlr_malloc_or_die(sizeof(old_peek_file_reader_t)); - pfr->fp = fp; - pfr->peekbuflen = maxnpeek + 1; - pfr->peekbuf = mlr_malloc_or_die(pfr->peekbuflen); - memset(pfr->peekbuf, 0, pfr->peekbuflen); - pfr->npeeked = 0; - - // Pre-read one char into the peekbuf so that we can say old_pfr_at_eof - // right away on the first call on an empty file. - // getc_unlocked() is appropriate since Miller is single-threaded. - pfr->peekbuf[pfr->npeeked++] = getc_unlocked(pfr->fp); // maybe EOF - - return pfr; -} - -// ---------------------------------------------------------------- -int old_pfr_at_eof(old_peek_file_reader_t* pfr) { - return pfr->npeeked >= 1 && pfr->peekbuf[0] == EOF; -} - -// ---------------------------------------------------------------- -// xxx inline this for perf. -int old_pfr_next_is(old_peek_file_reader_t* pfr, char* string, int len) { - // xxx abend on len > peekbuflen - while (pfr->npeeked < len) { - char c = getc_unlocked(pfr->fp); // maybe EOF - pfr->peekbuf[pfr->npeeked++] = c; - } - // xxx make a memeq, inlined. - return memcmp(string, pfr->peekbuf, len) == 0; -} - -// ---------------------------------------------------------------- -char old_pfr_read_char(old_peek_file_reader_t* pfr) { - if (pfr->npeeked == 1 && pfr->peekbuf[0] == EOF) { - return EOF; - } else if (pfr->npeeked == 0) { - pfr->peekbuf[0] = getc_unlocked(pfr->fp); // maybe EOF - pfr->npeeked = 1; - return pfr->peekbuf[0]; - } else { - char c = pfr->peekbuf[0]; - pfr->npeeked--; - for (int i = 0; i < pfr->npeeked; i++) - pfr->peekbuf[i] = pfr->peekbuf[i+1]; - return c; - } -} - -// ---------------------------------------------------------------- -void old_pfr_advance_by(old_peek_file_reader_t* pfr, int len) { - for (int i = 0; i < len; i++) - old_pfr_read_char(pfr); -} - -// ---------------------------------------------------------------- -void old_pfr_free(old_peek_file_reader_t* pfr) { - if (pfr == NULL) - return; - free(pfr->peekbuf); - pfr->fp = NULL; - pfr->peekbuf = NULL; - free(pfr); -} - -// ---------------------------------------------------------------- -void old_pfr_dump(old_peek_file_reader_t* pfr) { - printf("======================== pfr at %p\n", pfr); - printf(" peekbuflen = %d\n", pfr->peekbuflen); - printf(" npeeked = %d\n", pfr->npeeked); - for (int i = 0; i < pfr->npeeked; i++) { - char c = pfr->peekbuf[i]; - printf(" i=%d c=%c [%02x]\n", i, isprint((unsigned char)c) ? c : ' ', c); - } - printf("------------------------\n"); -} diff --git a/c/input/old_peek_file_reader.h b/c/input/old_peek_file_reader.h deleted file mode 100644 index 85fbb07977..0000000000 --- a/c/input/old_peek_file_reader.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef OLD_PEEK_FILE_READER_H -#define OLD_PEEK_FILE_READER_H - -#include - -typedef struct _old_peek_file_reader_t { - FILE* fp; - int peekbuflen; - char* peekbuf; - int npeeked; -} old_peek_file_reader_t; - -// The caller should fclose the fp, since presumably it will have opened it. We -// could have our constructor do the fopen (taking not fp but filename as -// argument) and the destructor do the fclose but that would break reading from -// stdin. - -old_peek_file_reader_t* old_pfr_alloc(FILE* fp, int maxnpeek); -// xxx needing contextual comments here. -int old_pfr_at_eof(old_peek_file_reader_t* pfr); -int old_pfr_next_is(old_peek_file_reader_t* pfr, char* string, int len); -char old_pfr_read_char(old_peek_file_reader_t* pfr); -void old_pfr_advance_by(old_peek_file_reader_t* pfr, int len); -void old_pfr_free(old_peek_file_reader_t* pfr); - -void old_pfr_dump(old_peek_file_reader_t* pfr); - -#endif // OLD_PEEK_FILE_READER_H