-
Notifications
You must be signed in to change notification settings - Fork 71
[ENG-1809] Improve .csv dialect sniffing #362
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,47 @@ | ||
import re | ||
import csv | ||
import logging | ||
|
||
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError | ||
from mfr.extensions.tabular import utilities | ||
from mfr.extensions.tabular.settings import MAX_FILE_SIZE, INIT_SNIFF_SIZE | ||
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def csv_stdlib(fp): | ||
"""Read and convert a csv file to JSON format using the python standard library | ||
:param fp: File pointer object | ||
:return: tuple of table headers and data | ||
|
||
Quirk: ``csv.Sniffer().sniff()`` needs the FULL first row and ONLY one full row to be able to | ||
effectively detect the correct dialect of the file. | ||
|
||
:param fp: the file pointer object | ||
:return: a tuple of table headers and data | ||
""" | ||
data = fp.read(2048) | ||
|
||
# Prepare the first row for sniffing | ||
data = fp.read(INIT_SNIFF_SIZE) | ||
data = _trim_or_append_data(fp, data, INIT_SNIFF_SIZE, 0) | ||
|
||
# Reset the file pointer | ||
fp.seek(0) | ||
|
||
# Sniff the first row to find a matching format | ||
try: | ||
dialect = csv.Sniffer().sniff(data) | ||
except csv.Error: | ||
dialect = csv.excel | ||
else: | ||
_set_dialect_quote_attrs(dialect, data) | ||
|
||
# Explicitly delete data when it is no longer used. | ||
del data | ||
|
||
# Create the CSV reader with the detected dialect | ||
reader = csv.DictReader(fp, dialect=dialect) | ||
|
||
# Update the reader field names to avoid duplicate column names when performing row extraction | ||
columns = [] | ||
# update the reader field names to avoid duplicate column names when performing row extraction | ||
for idx, fieldname in enumerate(reader.fieldnames or []): | ||
column_count = sum(1 for column in columns if fieldname == column['name']) | ||
if column_count: | ||
|
@@ -92,3 +110,63 @@ def _set_dialect_quote_attrs(dialect, data): | |
dialect.quotechar = '"' | ||
if re.search('"""[[({]\'.+\',', data): | ||
dialect.doublequote = True | ||
|
||
|
||
def _trim_or_append_data(fp, text, read_size, size_to_sniff, max_render_size=MAX_FILE_SIZE):
    """Read from ``fp`` until the first row of the file is complete, and return that row.

    ``text`` is the chunk that has already been read from the start of the file, and ``fp`` is
    positioned at the character immediately after it.  Whenever the current chunk holds no new
    line, another chunk twice the size of the previous read is fetched and examined.

    :param fp: the file pointer from which more data is read
    :param text: the chunk already read, checked first for a new line character
    :param read_size: the size passed to the ``fp.read()`` call that produced ``text``
    :param size_to_sniff: the accumulated size of the text examined before ``text``
    :param max_render_size: the max file size for render
    :return: the first row of the file as a string, without its line terminator
    :raises TabularRendererError: when the accumulated text reaches ``max_render_size`` before
        the row has been completed
    """
    pieces = []
    chunk = text

    # An empty chunk means either the file is empty or it ended without a trailing new line;
    # in both cases whatever has been collected so far is the whole first row.
    while len(chunk) > 0:
        newline_at = _find_new_line(chunk)
        if newline_at != -1:
            # The row ends inside this chunk: keep only the part before the new line.
            pieces.append(chunk[:newline_at])
            break

        # No new line yet: keep the whole chunk, account for it, and read twice as much next.
        pieces.append(chunk)
        size_to_sniff += read_size
        read_size *= 2
        chunk = fp.read(read_size)

        # There is no point in sniffing a row that already makes the file too large to render.
        if size_to_sniff + len(chunk) >= max_render_size:
            raise TabularRendererError(
                'The first row of this file is too large for the sniffer to detect the dialect. '
                'Please download and view it locally.',
                code=400,
                extension='csv'
            )

    return ''.join(pieces)
|
||
|
||
def _find_new_line(text): | ||
"""In the given text string, find the index of the first occurrence of any of the three types | ||
of new line character. Note: '\n\r' is not a new line character but two, one LF and one CR. | ||
|
||
1. \r\n Carriage Return (CR) and Line Feed (LF), must be checked first | ||
2. \n LF | ||
3. \r CR | ||
|
||
:param text: the text string to check | ||
:return: the index of the first new line character if found. Otherwise, return -1. | ||
""" | ||
index = text.find('\r\n') | ||
if index == -1: | ||
index = text.find('\n') | ||
if index == -1: | ||
index = text.find('\r') | ||
Comment on lines
+167
to
+171
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure why index = text.rfind('\r\n')
if index == -1:
index = text.rfind('\n')
if index == -1:
index = text.rfind('\r') |
||
return index |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
one, line, csv |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
import logging | ||
from io import StringIO | ||
from random import choice | ||
from string import ascii_letters | ||
|
||
import pytest | ||
|
||
from mfr.extensions.tabular.exceptions import TabularRendererError | ||
from mfr.extensions.tabular.libs.stdlib_tools import _find_new_line, _trim_or_append_data | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
INIT_SNIFF_SIZE = 128 | ||
MAX_RENDER_SIZE = INIT_SNIFF_SIZE * 8 | ||
|
||
|
||
@pytest.fixture
def text_with_lf():
    """Three rows separated by LF."""
    return 'text_with_lf\nanother_row\na_third_row'


@pytest.fixture
def text_with_cr():
    """Three rows separated by CR."""
    return 'text_with_cr\ranother_row\ra_third_row'


@pytest.fixture
def text_with_cr_lf():
    """Three rows separated by CR LF."""
    return 'text_with_cr_lf\r\nanother_row\r\na_third_row'


@pytest.fixture
def text_without_new_line():
    """A single tab-separated row with no new line character at all."""
    return 'text_without_new_line\tthe_same_row\tthe_same_row_continued'


@pytest.fixture
def small_text_partial():
    """Random letters, two characters shorter than the initial sniff size."""
    length = INIT_SNIFF_SIZE - 2
    return ''.join(choice(ascii_letters) for _ in range(length))


@pytest.fixture
def fp_small(small_text_partial):
    """File whose first row is ``small_text_partial``, followed by a second row."""
    content = '{}\nanother_row\n'.format(small_text_partial)
    return StringIO(content)


@pytest.fixture
def large_text_partial():
    """Random letters, one initial sniff size shorter than the max render size."""
    length = MAX_RENDER_SIZE - INIT_SNIFF_SIZE
    return ''.join(choice(ascii_letters) for _ in range(length))


@pytest.fixture
def fp_large(large_text_partial):
    """File whose first row is ``large_text_partial``, followed by a second row."""
    content = '{}\nanother_row\n'.format(large_text_partial)
    return StringIO(content)


@pytest.fixture
def fp_empty():
    """File with no content."""
    return StringIO('')


@pytest.fixture
def one_line_text():
    """Random letters with no new line, one initial sniff size shorter than the max render size."""
    length = MAX_RENDER_SIZE - INIT_SNIFF_SIZE
    return ''.join(choice(ascii_letters) for _ in range(length))


@pytest.fixture
def fp_one_line(one_line_text):
    """File that consists of ``one_line_text`` only."""
    return StringIO(one_line_text)


@pytest.fixture
def fp_oversize():
    """File whose first row is longer than the max render size."""
    length = MAX_RENDER_SIZE + 2
    oversize_text_partial = ''.join(choice(ascii_letters) for _ in range(length))
    content = '{}the_same_row\nanother_row\n'.format(oversize_text_partial)
    return StringIO(content)
|
||
|
||
class TestFindNewLine:
    """Checks for ``_find_new_line`` against each supported line-ending style."""

    def test_find_new_line_lf(self, text_with_lf):
        # 'text_with_lf' is 12 characters long, so the LF sits at index 12.
        assert _find_new_line(text_with_lf) == 12

    def test_find_new_line_cr(self, text_with_cr):
        # 'text_with_cr' is 12 characters long, so the CR sits at index 12.
        assert _find_new_line(text_with_cr) == 12

    def test_find_new_line_cr_lf(self, text_with_cr_lf):
        # 'text_with_cr_lf' is 15 characters long, so the CR LF starts at index 15.
        assert _find_new_line(text_with_cr_lf) == 15

    def test_find_new_line_none(self, text_without_new_line):
        # No new line character at all is reported as -1.
        assert _find_new_line(text_without_new_line) == -1
|
||
|
||
class TestTrimORAppendData:
    """Checks for ``_trim_or_append_data`` using the reduced sizes defined in this module."""

    @staticmethod
    def _first_row(fp):
        """Read the initial chunk from ``fp``, extract the first row, then close ``fp``.

        ``fp`` is left open when ``_trim_or_append_data`` raises.
        """
        initial_chunk = fp.read(INIT_SNIFF_SIZE)
        first_row = _trim_or_append_data(fp, initial_chunk, INIT_SNIFF_SIZE, 0,
                                         max_render_size=MAX_RENDER_SIZE)
        fp.close()
        return first_row

    def test_trim_or_append_data_small(self, fp_small, small_text_partial):
        assert self._first_row(fp_small) == small_text_partial

    def test_trim_or_append_data_large(self, fp_large, large_text_partial):
        assert self._first_row(fp_large) == large_text_partial

    def test_trim_or_append_data_empty(self, fp_empty):
        assert self._first_row(fp_empty) == ''

    def test_trim_or_append_data_one_line(self, fp_one_line, one_line_text):
        assert self._first_row(fp_one_line) == one_line_text

    def test_trim_or_append_data_error_upon_max_render_size(self, fp_oversize):
        with pytest.raises(TabularRendererError):
            self._first_row(fp_oversize)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The initial solution (see below) didn't work 100% of the time: for some files, sniffing the full file produced the wrong delimiter, while sniffing only the first row worked as expected. This is why, even when the sniffer is able to sniff the full file, MFR only provides it with the first row.