[ENG-1809] Improve .csv dialect sniffing #362

Open · wants to merge 2 commits into base: develop
88 changes: 83 additions & 5 deletions mfr/extensions/tabular/libs/stdlib_tools.py
@@ -1,29 +1,47 @@
import re
import csv
import logging

from mfr.extensions.tabular import utilities
from mfr.extensions.tabular.settings import MAX_FILE_SIZE, INIT_SNIFF_SIZE
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError

logger = logging.getLogger(__name__)


def csv_stdlib(fp):
    """Read and convert a csv file to JSON format using the Python standard library.

    Quirk: ``csv.Sniffer().sniff()`` needs the FULL first row and ONLY one full row to be able
    to reliably detect the correct dialect of the file.

    :param fp: the file pointer object
    :return: a tuple of table headers and data
    """
    # Prepare the first row for sniffing
    data = fp.read(INIT_SNIFF_SIZE)
    data = _trim_or_append_data(fp, data, INIT_SNIFF_SIZE, 0)
Comment on lines +22 to +24 — Contributor Author:
The initial solution (see below) didn't work 100%: for some files, sniffing the full file picked the wrong delimiter, while sniffing only the first row worked as expected. This is why MFR provides the sniffer with only the first row, even when it could sniff the full file.

...
data = fp.read(TABULAR_INIT_SNIFF_SIZE)
if len(data) == TABULAR_INIT_SNIFF_SIZE:
    data = _trim_or_append_data(fp, data, TABULAR_INIT_SNIFF_SIZE, 0)
...
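(Illustration, not part of the diff: a minimal sketch of the quirk above, using made-up sample strings — a single complete row sniffs cleanly, while a tail cut off mid-row can throw the sniffer off.)

import csv

# Hypothetical sample strings, not from the PR's test data.
complete_first_row = 'name;age;city\n'
truncated_tail = 'alice;3'  # a second row cut off mid-row by a fixed-size read

# A single complete first row sniffs cleanly.
assert csv.Sniffer().sniff(complete_first_row).delimiter == ';'

# Adding the truncated tail gives the sniffer inconsistent per-line counts for ';',
# so its guess is no longer trustworthy: it may pick another character entirely,
# or raise csv.Error on other inputs.
try:
    print(csv.Sniffer().sniff(complete_first_row + truncated_tail).delimiter)
except csv.Error:
    print('sniffer could not determine the dialect')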


    # Reset the file pointer
    fp.seek(0)

    # Sniff the first row to find a matching format
    try:
        dialect = csv.Sniffer().sniff(data)
    except csv.Error:
        dialect = csv.excel
    else:
        _set_dialect_quote_attrs(dialect, data)

    # Explicitly delete data when it is no longer used.
    del data

    # Create the CSV reader with the detected dialect
    reader = csv.DictReader(fp, dialect=dialect)

    # Update the reader field names to avoid duplicate column names when performing row extraction
    columns = []
    for idx, fieldname in enumerate(reader.fieldnames or []):
        column_count = sum(1 for column in columns if fieldname == column['name'])
        if column_count:
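(The hunk is truncated here; as a hedged sketch — a hypothetical helper, not the PR's actual continuation — the dedup idea is to rename repeated fieldnames so row-extraction keys stay unique:)

def dedup_fieldnames(fieldnames):
    """Rename repeats, e.g. ['a', 'b', 'a', 'a'] -> ['a', 'b', 'a (1)', 'a (2)']."""
    seen = {}
    result = []
    for name in fieldnames:
        count = seen.get(name, 0)
        result.append('{} ({})'.format(name, count) if count else name)
        seen[name] = count + 1
    return result

assert dedup_fieldnames(['a', 'b', 'a', 'a']) == ['a', 'b', 'a (1)', 'a (2)']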
@@ -92,3 +110,63 @@ def _set_dialect_quote_attrs(dialect, data):
        dialect.quotechar = '"'
    if re.search('"""[[({]\'.+\',', data):
        dialect.doublequote = True
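(For context — an illustration, not part of the diff — the doublequote regex above fires on data where a tripled double quote wraps a bracketed, single-quoted value:)

import re

# Hypothetical cell: a doubled quote char around a bracketed single-quoted list.
sample = '"""[\'alpha\', \'beta\']""",next_column'
assert re.search('"""[[({]\'.+\',', sample) is not None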


def _trim_or_append_data(fp, text, read_size, size_to_sniff, max_render_size=MAX_FILE_SIZE):
    """Recursively read data from a file and return its full first row. The file starts with
    ``text`` and the file pointer points to the next character immediately after ``text``.

    :param fp: the file pointer from which data is read
    :param text: the current text chunk in which to look for a newline character
    :param read_size: the size of the last ``fp.read()`` call
    :param size_to_sniff: the accumulated size of the text to sniff
    :param max_render_size: the max file size for rendering
Comment from the Contributor Author:
Instead of using MAX_FILE_SIZE directly, max_render_size= provides an option for unit tests to use a much smaller size.

    :return: the first row of the file as a string
    """

    # Return on empty text. This handles the corner case where the CSV is empty or only contains
    # one line without any new line characters.
    if len(text) == 0:
        return ''

    # Try to find the first new line character in the text chunk
    index = _find_new_line(text)
    # If found, return the trimmed substring
    if index != -1:
        return text[:index]
    # Otherwise, update `size_to_sniff` and then read more text (2 times the last `read_size`)
    size_to_sniff += read_size
    read_size *= 2
    more_text = fp.read(read_size)

    # If the text to sniff now goes over the max file size limit, raise the renderer error since
    # there is no need to sniff when the file is already too large to be rendered.
    if size_to_sniff + len(more_text) >= max_render_size:
        raise TabularRendererError(
            'The first row of this file is too large for the sniffer to detect the dialect. '
            'Please download and view it locally.',
            code=400,
            extension='csv'
        )
    # If the size is still within the limit, recursively check `more_text`
    return text + _trim_or_append_data(fp, more_text, read_size, size_to_sniff,
                                       max_render_size=max_render_size)
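(A quick usage sketch, mirroring how csv_stdlib drives the helper but with made-up sizes; the recursion keeps doubling the read until the first newline shows up:)

from io import StringIO

fp = StringIO('col_a,col_b,col_c\n1,2,3\n')
chunk = fp.read(8)  # first 8 bytes: 'col_a,co' -- no newline yet
first_row = _trim_or_append_data(fp, chunk, 8, 0, max_render_size=1024)
assert first_row == 'col_a,col_b,col_c'  # trimmed at the first '\n'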


def _find_new_line(text):
    """In the given text string, find the index of the first occurrence of any of the three types
    of new line character. Note: '\n\r' is not one new line character but two: one LF and one CR.

    1. '\r\n' Carriage Return (CR) and Line Feed (LF), must be checked first
    2. '\n' LF
    3. '\r' CR

    :param text: the text string to check
    :return: the index of the first new line character if found; otherwise -1
    """
    index = text.find('\r\n')
    if index == -1:
        index = text.find('\n')
        if index == -1:
            index = text.find('\r')
Comment on lines +167 to +171 — cslzchen (Contributor Author), Jun 5, 2020:
I am not sure why .rfind() was used in the initial solution. One guess is that it only read a fixed size for sniffing, so it wanted to include as many rows as possible. However, this is no longer the case in my implementation, where only the first row is sniffed.

    index = text.rfind('\r\n')
    if index == -1:
        index = text.rfind('\n')
        if index == -1:
            index = text.rfind('\r')

    return index
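(A small illustration — not from the PR — of why '\r\n' must be checked before '\n': trimming at the bare LF of a CRLF pair leaves a stray CR at the end of the row:)

text = 'row_one\r\nrow_two'
assert text.find('\r\n') == 7                  # trim here: text[:7] == 'row_one'
assert text[:text.find('\n')] == 'row_one\r'   # trimming at '\n' keeps the stray CR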
4 changes: 3 additions & 1 deletion mfr/extensions/tabular/settings.py
@@ -4,11 +4,13 @@

config = settings.child('TABULAR_EXTENSION_CONFIG')

MAX_FILE_SIZE = int(config.get('MAX_FILE_SIZE', 10 * 1024 * 1024)) # 10MB
MAX_SIZE = int(config.get('MAX_SIZE', 10000)) # max number of rows or columns allowed.
TABLE_WIDTH = int(config.get('TABLE_WIDTH', 700))
TABLE_HEIGHT = int(config.get('TABLE_HEIGHT', 600))

INIT_SNIFF_SIZE = int(config.get('INIT_SNIFF_SIZE', 2 * 1024)) # 2KB

LIBS = config.get_object('LIBS', {
    '.csv': [libs.csv_stdlib],
    '.tsv': [libs.csv_stdlib],
1 change: 1 addition & 0 deletions tests/extensions/tabular/files/long_row.csv
@@ -0,0 +1 @@
paper_id,pdf_filename,original_materials_link,original_poweranalysis_link,coded_claim3b,coded_claim4,original_statistic_analysis_type,original_samplesize_calculation_contributor,original_analytic_sample_size_units_reported,original_analytic_sample_size_value_reported,original_statistic_fulltext_reported,original_statistic_effect_type_reported,original_statistic_type_reported,original_statistic_value_reported,original_statistic_df1_reported,original_statistic_df2_reported,original_coefficient_type_reported,original_coefficient_value_reported,original_coefficient_se_reported,original_p_value_type_reported,original_p_value_tails_reported,original_p_value_value_reported,original_effect_size_fulltext_reported,original_effect_size_type_reported,original_effect_size_value_reported,original_analytic_sample_size_value_reference,original_statistic_fulltext_reference,original_statistic_effect_type_reference,original_statistic_type_reference,original_statistic_value_reference,original_statistic_df1_reference,original_statistic_df2_reference,original_coefficient_type_reference,original_coefficient_value_reference,original_coefficient_se_reference,original_p_value_tails_reference,original_p_value_value_reference,original_effect_size_type_reference,original_effect_size_value_reference,rr_stage1_analytic_sample_size,rr_stage2_analytic_sample_size,original_poweranalysis_notes,original_statsteam_contributor,original_es_ub_ci_nativeunits,original_es_lb_ci_nativeunits,original_pearsons_r_defined,original_pearsons_r_numeric,original_es_ub_ci_pearson,original_es_lb_ci_pearson,original_power_small,original_power_medium,original_power_50_original_effect,original_power_75_original_effect,original_statsteam_notes,to_do,priority,stats_rownumeOQm,Mason_JournExPsychGen_2012_eOQm,https://osf.io/zmdyg,https://osf.io/42e5k/,"a 2 (block: progressive, stagnant) X 2 (scale valence: positive, negative) repeated-measures ANOVA on participants’ responses to the positive and negative affect scales following the main experimental exposure. The critical manipulation was between lists of words constructed as either stagnant (a target word, and 11 items with a high probability of being mentioned by previous participants as a response to that word, e.g”lettuce, tomato, green, vegetable…“, or progressive (which were constructed by choosing words across association matrices, e.g.”salt, pepper, cat, dog…“)","A 2 (block: progressive, stagnant) X 2 (scale valence: positive, negative) repeated-measures ANOVA on participants’ mood ratings yielded a significant interaction, F(1, 76) = 6.63, p = .01, partial eta squared = .08",ANOVA,Melissa Kline,participants,77,"F(1, 76) = 6.63",interaction,F,6.63,1,76,NC,NC,NC,exact,NC,0.01,partial eta squared = .08,partial_eta_squared,0.08,,"F(1, 76) = 6.63",interaction,F,6.63,1,76,,,,,0.01,cohen_f_squared,0.0872368,218,485,"This is the first 'extended' original-paper stats set, used to test the workflow. As such, we hope it is a straightforward, within-subjects ANOVA! Note that the power calculations were performed on the statistics as reported, so these (a) are copied over into the 'recalculated' fields since they are the values used in the power analysis and (b) they have not actually been checked for internal consistency.",,,,,,,,,,,,,to_review,1,1z106,Pappu_JournAcaMarkSci_2014_yDyG_z106,https://osf.io/3urx9/?view_only=e0dda2b5a95540c4b053dfd87752574c,https://osf.io/hmbzp/,"We tested the hypotheses using structural equation modeling. 
The predictors were the sponsor (dummy coded), sponsorship relationship fit, sponsor-sponsee similarity, and their interaction. The outcome variable was participant intention to support the nonprofit, and the mediating variables were nonprofit clarity of positioning and the participant�s attitude toward the sponsorship | In Model B, in addition to estimating the main effects, we estimated a fit by similarity interaction on intent to support the nonprofit, through the mediator of clarity of positioning.","As predicted, fit and similarity jointly affected participants� intention to support the nonprofit (b = .10, p = .019) via clarity of positioning, as seen in Model B",SEM,Sam Field,participants,189,"b = .10, p = .019",interaction,NC,NC,NC,NC,NC,0.1,NC,exact,NC,0.019,NC,NC,NC,189,"coefficient = 0.10, p = 0.019",,,,,,path coefficient,0.1,0.04822441,one-tailed,0.019,,,821,1840,This replication has finished data collection; RR statistics aren't available yet but expected to be soon,,,,,,,,,,,,,to_do,1,2R8RN,Usta_JournMarketRes_2011_R8RN,https://osf.io/ks7ej/?view_only=5115343bd65c42639eb1fa8606d673d7,https://osf.io/mknax/,"Participants were randomly assigned to one of two experimental conditions: They were reminded of either a past decision that they delegated or one that they made independently, in both conditions by being asked to briefly describe the decision and then describe the thoughts and feelings that experiencing the decision caused them to have. Participants were also shown pictures of two desserts (a chocolate cake and a fruit salad) and asked to indicate which one they would eat if they had the two desserts in front of them right then. The measure of self-regulatory resource depletion was participants’ choice of dessert, which was analyzed with a logistic regression.","A logistic regression analysis showed that the choice share of the chocolate cake among those who had been instructed to recall a delegated decision (70.6%) was significantly greater than among participants who had been asked to recall an independent decision (31.6%; χ2(1, 36) = 5.46, p < .03)",logistic regression,Chris Aberson,participants,36,"chi squared(1, 36) = 5.46, p < .03",main effect,chi-square,5.46,1,36,NC,NC,NC,less-than,NC,0.03,NC,NC,NC,,,,,,,,,,,two-tailed,,log_odds_ratio,1.648251,119,263,"A regression powered using log odds; sample size projections conducted by Chris A. 
using Gpower",,,,,,,,,,,,,to_do,2,3GXEW,Srinivasan_Cognition_2010_GXEW,https://osf.io/acmz2/?view_only=79221d82b98146808721d4d43c5e2ea3,https://osf.io/6gtwn/,"A 2 X 2 X 2 mixed-factor repeated-measures ANOVA examined the effects of the between-subjects factor of group (congruent or incongruent tone/line pairings), and the within-subjects factors of test trial type (novel or familiar) and difficulty (easy or difficult) on the participants� familiarity ratings","A mixed-factor repeated-measures ANOVA on Experiment 3 participants� ratings of item familiarity (with factors for congruent/incongruent condition group, novel vs.�familiar trial type, and easy vs.�difficult trial type) revealed a �group X novelty interaction, F(1, 32) = 4.38,p < .05, indicating that participants in the congruent group better distinguished the familiar from novel pairings than participants in the incongruent group",ANOVA,NC,participants,37,"F(1, 32) = 4.38,p < .05",interaction,F,4.38,1,32,NC,NC,NC,less-than,NC,0.05,NC,NC,NC,34,"F(1,32) = 4.38",,F,4.38,1,32,,,,,,,0.136875,141,311,"A more complex ANOVA case; stats team please review the power analysis to get additional 'reference' values - here and in other cases, these are left blank if I didn't feel certain of what something was or how calculated",,,,,,,,,,,,,to_do,2,4
1 change: 1 addition & 0 deletions tests/extensions/tabular/files/one_line.csv
@@ -0,0 +1 @@
one, line, csv
34 changes: 30 additions & 4 deletions tests/extensions/tabular/test_renderer.py
@@ -25,8 +25,18 @@ def test_csv_file_path():


@pytest.fixture
def empty_csv_file_path():
    return os.path.join(PATH, 'files', 'empty.csv')


@pytest.fixture
def one_line_csv_file_path():
    return os.path.join(PATH, 'files', 'one_line.csv')


@pytest.fixture
def long_row_csv_file_path():
    return os.path.join(PATH, 'files', 'long_row.csv')


@pytest.fixture
@@ -107,10 +117,26 @@ def test_render_tabular_csv(self, test_csv_file_path, assets_url, export_url):
        body = renderer.render()
        assert BODY in body

    def test_render_tabular_csv_one_line(self, one_line_csv_file_path, assets_url, export_url):
        metadata = ProviderMetadata('test', '.csv', 'text/plain', '1234',
                                    'http://wb.osf.io/file/test.csv?token=1234')
        renderer = TabularRenderer(metadata, one_line_csv_file_path, url, assets_url, export_url)
        body = renderer.render()
        assert BODY in body

    def test_render_tabular_csv_long_row(self, long_row_csv_file_path, assets_url, export_url):
        metadata = ProviderMetadata('test', '.csv', 'text/plain', '1234',
                                    'http://wb.osf.io/file/test.csv?token=1234')
        renderer = TabularRenderer(metadata, long_row_csv_file_path, url, assets_url, export_url)
        body = renderer.render()
        assert BODY in body

    # There is no way to test an "invalid" CSV unless the file is empty, since the CSV sniffer
    # attempts to make the best syntactic guess even if the file is semantically invalid.
    def test_render_tabular_csv_empty(self, empty_csv_file_path, assets_url, export_url):
        metadata = ProviderMetadata('test', '.csv', 'text/plain', '1234',
                                    'http://wb.osf.io/file/test.csv?token=1234')
        renderer = TabularRenderer(metadata, empty_csv_file_path, url, assets_url, export_url)
        with pytest.raises(exceptions.EmptyTableError):
            renderer.render()

132 changes: 132 additions & 0 deletions tests/extensions/tabular/test_stdlib_tools.py
@@ -0,0 +1,132 @@
import logging
from io import StringIO
from random import choice
from string import ascii_letters

import pytest

from mfr.extensions.tabular.exceptions import TabularRendererError
from mfr.extensions.tabular.libs.stdlib_tools import _find_new_line, _trim_or_append_data

logger = logging.getLogger(__name__)

INIT_SNIFF_SIZE = 128
MAX_RENDER_SIZE = INIT_SNIFF_SIZE * 8
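# With INIT_SNIFF_SIZE = 128, the helper's reads double on each call (128, 256, 512, ...),
# so the MAX_RENDER_SIZE = 1024 cap is reached within a few recursions in the oversize test.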


@pytest.fixture
def text_with_lf():
    return 'text_with_lf\nanother_row\na_third_row'


@pytest.fixture
def text_with_cr():
    return 'text_with_cr\ranother_row\ra_third_row'


@pytest.fixture
def text_with_cr_lf():
    return 'text_with_cr_lf\r\nanother_row\r\na_third_row'


@pytest.fixture
def text_without_new_line():
    return 'text_without_new_line\tthe_same_row\tthe_same_row_continued'


@pytest.fixture
def small_text_partial():
    return ''.join(choice(ascii_letters) for _ in range(INIT_SNIFF_SIZE - 2))


@pytest.fixture
def fp_small(small_text_partial):
    return StringIO('{}\nanother_row\n'.format(small_text_partial))


@pytest.fixture
def large_text_partial():
    return ''.join(choice(ascii_letters) for _ in range(MAX_RENDER_SIZE - INIT_SNIFF_SIZE))


@pytest.fixture
def fp_large(large_text_partial):
    return StringIO('{}\nanother_row\n'.format(large_text_partial))


@pytest.fixture
def fp_empty():
    return StringIO('')


@pytest.fixture
def one_line_text():
    return ''.join(choice(ascii_letters) for _ in range(MAX_RENDER_SIZE - INIT_SNIFF_SIZE))


@pytest.fixture
def fp_one_line(one_line_text):
    return StringIO(one_line_text)


@pytest.fixture
def fp_oversize():
    oversize_text_partial = ''.join(choice(ascii_letters) for _ in range(MAX_RENDER_SIZE + 2))
    return StringIO('{}the_same_row\nanother_row\n'.format(oversize_text_partial))


class TestFindNewLine:

    def test_find_new_line_lf(self, text_with_lf):
        index = _find_new_line(text_with_lf)
        assert index == 12

    def test_find_new_line_cr(self, text_with_cr):
        index = _find_new_line(text_with_cr)
        assert index == 12

    def test_find_new_line_cr_lf(self, text_with_cr_lf):
        index = _find_new_line(text_with_cr_lf)
        assert index == 15

    def test_find_new_line_none(self, text_without_new_line):
        index = _find_new_line(text_without_new_line)
        assert index == -1


class TestTrimORAppendData:

    def test_trim_or_append_data_small(self, fp_small, small_text_partial):
        data = fp_small.read(INIT_SNIFF_SIZE)
        data = _trim_or_append_data(fp_small, data, INIT_SNIFF_SIZE, 0,
                                    max_render_size=MAX_RENDER_SIZE)
        fp_small.close()
        assert data == small_text_partial

    def test_trim_or_append_data_large(self, fp_large, large_text_partial):
        data = fp_large.read(INIT_SNIFF_SIZE)
        data = _trim_or_append_data(fp_large, data, INIT_SNIFF_SIZE, 0,
                                    max_render_size=MAX_RENDER_SIZE)
        fp_large.close()
        assert data == large_text_partial

    def test_trim_or_append_data_empty(self, fp_empty):
        data = fp_empty.read(INIT_SNIFF_SIZE)
        data = _trim_or_append_data(fp_empty, data, INIT_SNIFF_SIZE, 0,
                                    max_render_size=MAX_RENDER_SIZE)
        fp_empty.close()
        assert data == ''

    def test_trim_or_append_data_one_line(self, fp_one_line, one_line_text):
        data = fp_one_line.read(INIT_SNIFF_SIZE)
        data = _trim_or_append_data(fp_one_line, data, INIT_SNIFF_SIZE, 0,
                                    max_render_size=MAX_RENDER_SIZE)
        fp_one_line.close()
        assert data == one_line_text

    def test_trim_or_append_data_error_upon_max_render_size(self, fp_oversize):
        with pytest.raises(TabularRendererError):
            data = fp_oversize.read(INIT_SNIFF_SIZE)
            _trim_or_append_data(fp_oversize, data, INIT_SNIFF_SIZE, 0,
                                 max_render_size=MAX_RENDER_SIZE)
        fp_oversize.close()