Skip to content

Commit 60521e9

Browse files
committed
CSV sniffer sniffs the full first row for dialect detection
[skip ci]
1 parent d1fac6d commit 60521e9

File tree

2 files changed

+75
-6
lines changed

2 files changed

+75
-6
lines changed

mfr/extensions/tabular/libs/stdlib_tools.py

Lines changed: 72 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,47 @@
11
import re
22
import csv
3+
import logging
34

4-
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
55
from mfr.extensions.tabular import utilities
6+
from mfr.extensions.tabular.settings import MAX_FILE_SIZE, INIT_SNIFF_SIZE
7+
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
8+
9+
logger = logging.getLogger(__name__)
610

711

812
def csv_stdlib(fp):
913
"""Read and convert a csv file to JSON format using the python standard library
10-
:param fp: File pointer object
11-
:return: tuple of table headers and data
14+
15+
Quirk: ``csv.Sniffer().sniff()`` needs the FULL first row and ONLY one full row to be able to
16+
effectively detect the correct dialect of the file.
17+
18+
:param fp: the file pointer object
19+
:return: a tuple of table headers and data
1220
"""
13-
data = fp.read(2048)
21+
22+
# Prepare the first row for sniffing
23+
data = fp.read(INIT_SNIFF_SIZE)
24+
data = _trim_or_append_data(fp, data, INIT_SNIFF_SIZE, 0)
25+
26+
# Reset the file pointer
1427
fp.seek(0)
1528

29+
# Sniff the first row to find a matching format
1630
try:
1731
dialect = csv.Sniffer().sniff(data)
1832
except csv.Error:
1933
dialect = csv.excel
2034
else:
2135
_set_dialect_quote_attrs(dialect, data)
2236

37+
# Explicitly delete data when it is no longer used.
2338
del data
39+
40+
# Create the CSV reader with the detected dialect
2441
reader = csv.DictReader(fp, dialect=dialect)
42+
43+
# Update the reader field names to avoid duplicate column names when performing row extraction
2544
columns = []
26-
# update the reader field names to avoid duplicate column names when performing row extraction
2745
for idx, fieldname in enumerate(reader.fieldnames or []):
2846
column_count = sum(1 for column in columns if fieldname == column['name'])
2947
if column_count:
@@ -92,3 +110,52 @@ def _set_dialect_quote_attrs(dialect, data):
92110
dialect.quotechar = '"'
93111
if re.search('"""[[({]\'.+\',', data):
94112
dialect.doublequote = True
113+
114+
115+
def _trim_or_append_data(fp, text, read_size, sniff_size):
    """Read data from a file until a new line is found and return everything before it.

    The file starts with ``text`` and the file pointer points to the next character
    immediately after ``text``.  Each additional read requests twice as much as the previous
    one.  If the end of the file is reached before any new line character is found, all of
    the text read so far is returned.

    :param fp: the file pointer from which data is read
    :param text: the current text chunk to check for a new line character
    :param read_size: the size requested by the last ``fp.read()`` call
    :param sniff_size: the accumulated size of the text to sniff
    :return: the text up to (not including) the new line character, as a string
    :raises TabularRendererError: if the accumulated size reaches ``MAX_FILE_SIZE`` before a
        new line character is found
    """

    chunks = []
    while True:
        # Try to find a new line character in the current text chunk
        index = _find_new_line(text)
        # If found, keep only the part before it and stop reading
        if index != -1:
            chunks.append(text[:index])
            break
        chunks.append(text)

        # Otherwise, update `sniff_size` and then read more (2 times the last `read_size`) text.
        # `sniff_size` grows by the requested size, which is an upper bound of what was read.
        sniff_size += read_size
        read_size *= 2
        more_text = fp.read(read_size)

        # If text to sniff now goes over the max file size limit, raise the renderer error
        # since there is no need to sniff when the file is already too large to be rendered.
        if sniff_size + len(more_text) >= MAX_FILE_SIZE:
            raise TabularRendererError(
                'The first row of this file is too large for the sniffer to detect the dialect. '
                'Please download and view it locally.',
                code=400,
                extension='csv'
            )

        # An empty read means the end of the file: there is nothing left to search, so the
        # text gathered so far is returned.  Without this check a file that has no new line
        # character would keep issuing empty reads until the size limit above is hit.
        if not more_text:
            break

        # If the size is still within the limit, check the newly read chunk
        text = more_text

    return ''.join(chunks)
147+
148+
149+
def _find_new_line(text):
150+
"""Check the text string for any type of new line character.
151+
152+
:param text: the text string to check
153+
:return: the index of the new line character if found. Otherwise, return -1.
154+
"""
155+
156+
index = text.rfind('\r\n')
157+
if index == -1:
158+
index = text.rfind('\n')
159+
if index == -1:
160+
index = text.rfind('\r')
161+
return index

mfr/extensions/tabular/settings.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44

55
config = settings.child('TABULAR_EXTENSION_CONFIG')
66

7-
MAX_FILE_SIZE = int(config.get('MAX_FILE_SIZE', 10 * 1024 * 1024)) # 10Mb
7+
MAX_FILE_SIZE = int(config.get('MAX_FILE_SIZE', 10 * 1024 * 1024)) # 10MB
88
MAX_SIZE = int(config.get('MAX_SIZE', 10000)) # max number of rows or columns allowed.
99
TABLE_WIDTH = int(config.get('TABLE_WIDTH', 700))
1010
TABLE_HEIGHT = int(config.get('TABLE_HEIGHT', 600))
1111

12+
INIT_SNIFF_SIZE = int(config.get('INIT_SNIFF_SIZE', 2 * 1024)) # 2KB
13+
1214
LIBS = config.get_object('LIBS', {
1315
'.csv': [libs.csv_stdlib],
1416
'.tsv': [libs.csv_stdlib],

0 commit comments

Comments
 (0)