Skip to content

Commit 1de56f0

Browse files
committed
Temp commit, pls rebase (add tests, remove loggers, etc.) [skip ci]
1 parent d1fac6d commit 1de56f0

File tree

2 files changed

+90
-6
lines changed

2 files changed

+90
-6
lines changed

mfr/extensions/tabular/libs/stdlib_tools.py

Lines changed: 87 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,52 @@
11
import re
22
import csv
3+
import logging
34

4-
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
55
from mfr.extensions.tabular import utilities
6+
from mfr.extensions.tabular.settings import MAX_FILE_SIZE, TABULAR_INIT_SNIFF_SIZE
7+
from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
8+
9+
logger = logging.getLogger(__name__)
610

711

812
def csv_stdlib(fp):
913
"""Read and convert a csv file to JSON format using the python standard library
10-
:param fp: File pointer object
11-
:return: tuple of table headers and data
14+
15+
Quirk: ``csv.Sniffer().sniff()`` needs the FULL first row and ONLY one full row to be able to
16+
effectively detect the correct dialect of the file.
17+
18+
:param fp: the file pointer object
19+
:return: a tuple of table headers and data
1220
"""
13-
data = fp.read(2048)
21+
22+
logger.info('>>> ??? &&& ~~~ current settings')
23+
logger.info('>>> ??? &&& ~~~ max rendering file size = {}'.format(MAX_FILE_SIZE))
24+
logger.info('>>> ??? &&& ~~~ initial sniffing size = {}'.format(TABULAR_INIT_SNIFF_SIZE))
25+
26+
# Prepare the first row for sniffing
27+
data = fp.read(TABULAR_INIT_SNIFF_SIZE)
28+
data = _trim_or_append_data(fp, data, TABULAR_INIT_SNIFF_SIZE, 0)
29+
30+
# Reset the file pointer
1431
fp.seek(0)
1532

33+
# Sniff the first row to find a matching format
1634
try:
1735
dialect = csv.Sniffer().sniff(data)
1836
except csv.Error:
1937
dialect = csv.excel
2038
else:
2139
_set_dialect_quote_attrs(dialect, data)
2240

41+
# Explicitly delete data when it is no longer used.
2342
del data
43+
44+
logger.info('>>> ??? &&& ~~~ dialect delimiter detected = {}'.format(dialect.delimiter))
45+
# Create the CSV reader with the detected dialect
2446
reader = csv.DictReader(fp, dialect=dialect)
47+
48+
# Update the reader field names to avoid duplicate column names when performing row extraction
2549
columns = []
26-
# update the reader field names to avoid duplicate column names when performing row extraction
2750
for idx, fieldname in enumerate(reader.fieldnames or []):
2851
column_count = sum(1 for column in columns if fieldname == column['name'])
2952
if column_count:
@@ -92,3 +115,62 @@ def _set_dialect_quote_attrs(dialect, data):
92115
dialect.quotechar = '"'
93116
if re.search('"""[[({]\'.+\',', data):
94117
dialect.doublequote = True
118+
119+
120+
def _trim_or_append_data(fp, text, read_size, sniff_size):
    """Recursively read data from ``fp`` and return a newline-terminated prefix for sniffing.

    The file starts with ``text`` and the file pointer points to the character
    immediately after ``text``.  If ``text`` already contains a newline, it is
    trimmed at the last newline so only complete rows are returned.  Otherwise
    more data is read (doubling the read size on each recursive call) until a
    newline is found or the accumulated size reaches ``MAX_FILE_SIZE``.

    :param fp: the file pointer from which data is read
    :param text: the current text chunk to check for a new line character
    :param read_size: the size used for the last ``fp.read()`` call
    :param sniff_size: the accumulated size of the text sniffed so far
    :return: the leading complete row(s) of the file as a string
    :raises TabularRendererError: if the first row exceeds ``MAX_FILE_SIZE``
    """
    # NOTE(review): _find_new_line() returns the index of the LAST newline in
    # `text`, so the returned prefix may contain several complete rows rather
    # than strictly the first one — acceptable for dialect sniffing, but the
    # csv_stdlib() docstring's "ONLY one full row" claim should be confirmed.
    index = _find_new_line(text)
    if index != -1:
        return text[:index]

    # No newline yet: read more text (twice the previous amount) and recurse.
    sniff_size += read_size
    read_size *= 2
    more_text = fp.read(read_size)

    # If the text to sniff now goes over the max file size limit, raise the
    # renderer error: there is no point sniffing a file that is already too
    # large to be rendered.
    if sniff_size + len(more_text) >= MAX_FILE_SIZE:
        raise TabularRendererError(
            'The first row of this file is too large for the sniffer to detect the dialect. '
            'Please download and view it locally.',
            code=400,
            extension='csv'
        )
    return text + _trim_or_append_data(fp, more_text, read_size, sniff_size)
159+
160+
161+
def _find_new_line(text):
162+
"""Check the text string for any type of new line character.
163+
164+
:param text: the text string to check
165+
:return: the index of the new line character if found. Otherwise, return -1.
166+
"""
167+
168+
index = text.rfind('\r\n')
169+
if index == -1:
170+
index = text.rfind('\n')
171+
if index == -1:
172+
index = text.rfind('\r')
173+
174+
logger.info('>>> ??? &&& ~~~ new line index = {}'.format(index))
175+
176+
return index

mfr/extensions/tabular/settings.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44

55
config = settings.child('TABULAR_EXTENSION_CONFIG')
66

7-
MAX_FILE_SIZE = int(config.get('MAX_FILE_SIZE', 10 * 1024 * 1024)) # 10Mb
7+
MAX_FILE_SIZE = int(config.get('MAX_FILE_SIZE', 10 * 1024 * 1024)) # 10MB
88
MAX_SIZE = int(config.get('MAX_SIZE', 10000)) # max number of rows or columns allowed.
99
TABLE_WIDTH = int(config.get('TABLE_WIDTH', 700))
1010
TABLE_HEIGHT = int(config.get('TABLE_HEIGHT', 600))
1111

12+
TABULAR_INIT_SNIFF_SIZE = int(config.get('TABULAR_INIT_SNIFF_SIZE', 2 * 1024)) # 2KB
13+
1214
LIBS = config.get_object('LIBS', {
1315
'.csv': [libs.csv_stdlib],
1416
'.tsv': [libs.csv_stdlib],

0 commit comments

Comments
 (0)