Skip to content

Commit

Permalink
CSV read performance: fix #51
Browse files (browse the repository at this point in the history)
  • Loading branch information
johnkerl committed Sep 8, 2015
1 parent 99b88c1 commit bc4d076
Show file tree
Hide file tree
Showing 10 changed files with 31 additions and 621 deletions.
7 changes: 1 addition & 6 deletions c/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,7 @@ TEST_LREC_SRCS = lib/mlrutil.c lib/mlr_globals.c lib/string_builder.c \
containers/lrec.c containers/header_keeper.c containers/sllv.c \
containers/slls.c containers/lhmslv.c \
input/file_reader_mmap.c input/file_reader_stdio.c \
input/lrec_reader_stdio_csv.c \
input/lrec_reader_mmap_csvlite.c input/lrec_reader_stdio_csvlite.c \
input/old_peek_file_reader.c \
input/lrec_reader_mmap_dkvp.c input/lrec_reader_stdio_dkvp.c \
input/lrec_reader_mmap_nidx.c input/lrec_reader_stdio_nidx.c \
input/lrec_reader_mmap_xtab.c input/lrec_reader_stdio_xtab.c \
Expand All @@ -70,14 +68,12 @@ containers/join_bucket_keeper.c \
input/mmap_byte_reader.c \
input/stdio_byte_reader.c \
input/lrec_reader_in_memory.c input/lrec_readers.c \
input/lrec_reader_stdio_csv.c \
input/lrec_reader_csvex.c \
input/lrec_reader_csv.c \
input/lrec_reader_mmap_csvlite.c input/lrec_reader_stdio_csvlite.c \
input/lrec_reader_mmap_dkvp.c input/lrec_reader_stdio_dkvp.c \
input/lrec_reader_mmap_nidx.c input/lrec_reader_stdio_nidx.c \
input/lrec_reader_mmap_xtab.c input/lrec_reader_stdio_xtab.c \
input/file_reader_mmap.c input/file_reader_stdio.c \
input/old_peek_file_reader.c \
input/peek_file_reader.c \
containers/test_join_bucket_keeper.c

Expand All @@ -86,7 +82,6 @@ lib/mlrutil.c \
lib/mlr_globals.c \
lib/string_builder.c \
input/file_reader_mmap.c \
input/old_peek_file_reader.c \
experimental/getlines.c

# ================================================================
Expand Down
12 changes: 1 addition & 11 deletions c/cli/mlrcli.c
Original file line number Diff line number Diff line change
Expand Up @@ -288,24 +288,14 @@ cli_opts_t* parse_command_line(int argc, char** argv) {
argi++;
}

// ready for cutover! :)
// just leaving it off for a while to compare performance.
#if 0
else if (streq(argv[argi], "--csv")) { popts->ifmt = "csvex"; ofmt = "csv"; }
else if (streq(argv[argi], "--icsv")) { popts->ifmt = "csvex"; }
else if (streq(argv[argi], "--ocsv")) { ofmt = "csv"; }
#else
else if (streq(argv[argi], "--csv")) { popts->ifmt = ofmt = "csv"; }
else if (streq(argv[argi], "--icsv")) { popts->ifmt = "csv"; }
else if (streq(argv[argi], "--ocsv")) { ofmt = "csv"; }
#endif

else if (streq(argv[argi], "--csvlite")) { popts->ifmt = ofmt = "csvlite"; }
else if (streq(argv[argi], "--icsvlite")) { popts->ifmt = "csvlite"; }
else if (streq(argv[argi], "--ocsvlite")) { ofmt = "csvlite"; }

else if (streq(argv[argi], "--csvex")) { popts->ifmt = "csvex"; ofmt = "csv"; }
else if (streq(argv[argi], "--icsvex")) { popts->ifmt = "csvex"; }

else if (streq(argv[argi], "--dkvp")) { popts->ifmt = ofmt = "dkvp"; }
else if (streq(argv[argi], "--idkvp")) { popts->ifmt = "dkvp"; }
else if (streq(argv[argi], "--odkvp")) { ofmt = "dkvp"; }
Expand Down
140 changes: 0 additions & 140 deletions c/experimental/csv0.c

This file was deleted.

25 changes: 0 additions & 25 deletions c/experimental/pfr.c

This file was deleted.

50 changes: 25 additions & 25 deletions c/input/lrec_reader_csvex.c → c/input/lrec_reader_csv.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
#define DQUOTE_DQUOTE_STRIDX 0x2008

// ----------------------------------------------------------------
typedef struct _lrec_reader_csvex_state_t {
typedef struct _lrec_reader_csv_state_t {
// Input line number is not the same as the record-counter in context_t,
// which counts records.
long long ilno;
Expand Down Expand Up @@ -76,19 +76,19 @@ typedef struct _lrec_reader_csvex_state_t {
header_keeper_t* pheader_keeper;
lhmslv_t* pheader_keepers;

} lrec_reader_csvex_state_t;
} lrec_reader_csv_state_t;

static slls_t* lrec_reader_csvex_get_fields(lrec_reader_csvex_state_t* pstate);
static lrec_t* paste_header_and_data(lrec_reader_csvex_state_t* pstate, slls_t* pdata_fields);
static slls_t* lrec_reader_csv_get_fields(lrec_reader_csv_state_t* pstate);
static lrec_t* paste_header_and_data(lrec_reader_csv_state_t* pstate, slls_t* pdata_fields);

// ----------------------------------------------------------------
// xxx needs abend on null lhs. etc.

static lrec_t* lrec_reader_csvex_process(void* pvstate, void* pvhandle, context_t* pctx) {
lrec_reader_csvex_state_t* pstate = pvstate;
static lrec_t* lrec_reader_csv_process(void* pvstate, void* pvhandle, context_t* pctx) {
lrec_reader_csv_state_t* pstate = pvstate;

if (pstate->expect_header_line_next) {
slls_t* pheader_fields = lrec_reader_csvex_get_fields(pstate);
slls_t* pheader_fields = lrec_reader_csv_get_fields(pstate);
if (pheader_fields == NULL)
return NULL;
pstate->ilno++;
Expand All @@ -104,14 +104,14 @@ static lrec_t* lrec_reader_csvex_process(void* pvstate, void* pvhandle, context_
}
pstate->ilno++;

slls_t* pdata_fields = lrec_reader_csvex_get_fields(pstate);
slls_t* pdata_fields = lrec_reader_csv_get_fields(pstate);
if (pdata_fields == NULL) // EOF
return NULL;
else
return paste_header_and_data(pstate, pdata_fields);
}

static slls_t* lrec_reader_csvex_get_fields(lrec_reader_csvex_state_t* pstate) {
static slls_t* lrec_reader_csv_get_fields(lrec_reader_csv_state_t* pstate) {
int rc, stridx, matchlen, record_done, field_done;
peek_file_reader_t* pfr = pstate->pfr;
string_builder_t* psb = pstate->psb;
Expand Down Expand Up @@ -239,7 +239,7 @@ static slls_t* lrec_reader_csvex_get_fields(lrec_reader_csvex_state_t* pstate) {
}

// ----------------------------------------------------------------
static lrec_t* paste_header_and_data(lrec_reader_csvex_state_t* pstate, slls_t* pdata_fields) {
static lrec_t* paste_header_and_data(lrec_reader_csv_state_t* pstate, slls_t* pdata_fields) {
if (pstate->pheader_keeper->pkeys->length != pdata_fields->length) {
fprintf(stderr, "%s: Header/data length mismatch: %d != %d at line %lld.\n",
MLR_GLOBALS.argv0, pstate->pheader_keeper->pkeys->length, pdata_fields->length, pstate->ilno);
Expand All @@ -255,29 +255,29 @@ static lrec_t* paste_header_and_data(lrec_reader_csvex_state_t* pstate, slls_t*
}

// ----------------------------------------------------------------
void* lrec_reader_csvex_open(void* pvstate, char* filename) {
lrec_reader_csvex_state_t* pstate = pvstate;
void* lrec_reader_csv_open(void* pvstate, char* filename) {
lrec_reader_csv_state_t* pstate = pvstate;
pstate->pfr->pbr->popen_func(pstate->pfr->pbr, filename);
pfr_reset(pstate->pfr);
return NULL; // xxx modify the API after the functional refactor is complete
}

void lrec_reader_csvex_close(void* pvstate, void* pvhandle) {
lrec_reader_csvex_state_t* pstate = pvstate;
void lrec_reader_csv_close(void* pvstate, void* pvhandle) {
lrec_reader_csv_state_t* pstate = pvstate;
pstate->pfr->pbr->pclose_func(pstate->pfr->pbr);
}

// ----------------------------------------------------------------
// xxx after the pfr/pbr refactor is complete, vsof and vopen may be redundant.
static void lrec_reader_csvex_sof(void* pvstate) {
lrec_reader_csvex_state_t* pstate = pvstate;
static void lrec_reader_csv_sof(void* pvstate) {
lrec_reader_csv_state_t* pstate = pvstate;
pstate->ilno = 0LL;
pstate->expect_header_line_next = TRUE;
}

// ----------------------------------------------------------------
static void lrec_reader_csvex_free(void* pvstate) {
lrec_reader_csvex_state_t* pstate = pvstate;
static void lrec_reader_csv_free(void* pvstate) {
lrec_reader_csv_state_t* pstate = pvstate;
for (lhmslve_t* pe = pstate->pheader_keepers->phead; pe != NULL; pe = pe->pnext) {
header_keeper_t* pheader_keeper = pe->pvvalue;
header_keeper_free(pheader_keeper);
Expand All @@ -286,10 +286,10 @@ static void lrec_reader_csvex_free(void* pvstate) {
}

// ----------------------------------------------------------------
lrec_reader_t* lrec_reader_csvex_alloc(byte_reader_t* pbr, char irs, char ifs) {
lrec_reader_t* lrec_reader_csv_alloc(byte_reader_t* pbr, char irs, char ifs) {
lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t));

lrec_reader_csvex_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_csvex_state_t));
lrec_reader_csv_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_csv_state_t));
pstate->ilno = 0LL;

pstate->eof = "\xff";
Expand Down Expand Up @@ -349,11 +349,11 @@ lrec_reader_t* lrec_reader_csvex_alloc(byte_reader_t* pbr, char irs, char ifs) {
pstate->pheader_keepers = lhmslv_alloc();

plrec_reader->pvstate = (void*)pstate;
plrec_reader->popen_func = &lrec_reader_csvex_open;
plrec_reader->pclose_func = &lrec_reader_csvex_close;
plrec_reader->pprocess_func = &lrec_reader_csvex_process;
plrec_reader->psof_func = &lrec_reader_csvex_sof;
plrec_reader->pfree_func = &lrec_reader_csvex_free;
plrec_reader->popen_func = &lrec_reader_csv_open;
plrec_reader->pclose_func = &lrec_reader_csv_close;
plrec_reader->pprocess_func = &lrec_reader_csv_process;
plrec_reader->psof_func = &lrec_reader_csv_sof;
plrec_reader->pfree_func = &lrec_reader_csv_free;

return plrec_reader;
}
Loading

0 comments on commit bc4d076

Please sign in to comment.