diff --git a/mfr/extensions/tabular/libs/__init__.py b/mfr/extensions/tabular/libs/__init__.py
index 609dc7a35..14e9cb375 100644
--- a/mfr/extensions/tabular/libs/__init__.py
+++ b/mfr/extensions/tabular/libs/__init__.py
@@ -8,6 +8,11 @@ def csv_stdlib():
     return csv_stdlib
 
 
+def tsv_stdlib():
+    from ..libs.stdlib_tools import tsv_stdlib
+    return tsv_stdlib
+
+
 def csv_pandas():
     from ..libs.panda_tools import csv_pandas
     return csv_pandas
diff --git a/mfr/extensions/tabular/libs/stdlib_tools.py b/mfr/extensions/tabular/libs/stdlib_tools.py
index 542d5744e..2673d5084 100644
--- a/mfr/extensions/tabular/libs/stdlib_tools.py
+++ b/mfr/extensions/tabular/libs/stdlib_tools.py
@@ -1,57 +1,87 @@
-import re
 import csv
+from http import HTTPStatus
 
-from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
 from mfr.extensions.tabular import utilities
+from mfr.extensions.tabular.exceptions import (EmptyTableError,
+                                               TabularRendererError)
 
 
 def csv_stdlib(fp):
-    """Read and convert a csv file to JSON format using the python standard library
-    :param fp: File pointer object
-    :return: tuple of table headers and data
-    """
-    data = fp.read(2048)
+    try:
+        # CSVs are always values separated by commas;
+        # sniff for quoting and spaces after commas
+        dialect = csv.Sniffer().sniff(fp.read(), ',')
+    except Exception:
+        dialect = csv.excel
     fp.seek(0)
+    reader = csv.DictReader(fp, dialect=dialect)
+    return parse_stdlib(reader, 'csv')
+
 
+def tsv_stdlib(fp):
     try:
-        dialect = csv.Sniffer().sniff(data)
-    except csv.Error:
-        dialect = csv.excel
-    else:
-        _set_dialect_quote_attrs(dialect, data)
-
+        # TSVs are always values separated by tabs;
+        # sniff for quoting and spaces after tabs
+        dialect = csv.Sniffer().sniff(fp.read(), '\t')
+    except Exception:
+        dialect = csv.excel_tab
+    fp.seek(0)
     reader = csv.DictReader(fp, dialect=dialect)
+    return parse_stdlib(reader, 'tsv')
+
+
+def parse_stdlib(reader, ext):
+    """Read and convert a CSV-like file to JSON format using the Python standard library
+    :param reader: csv.DictReader over the file contents
+    :return: dict mapping sheet name to a tuple of table headers and data
+    """
     columns = []
     # update the reader field names to avoid duplicate column names when performing row extraction
-    for idx, fieldname in enumerate(reader.fieldnames or []):
-        column_count = sum(1 for column in columns if fieldname == column['name'])
-        if column_count:
-            unique_fieldname = '{}-{}'.format(fieldname, column_count + 1)
-            reader.fieldnames[idx] = unique_fieldname
-        else:
-            unique_fieldname = fieldname
-        columns.append({
-            'id': unique_fieldname,
-            'field': unique_fieldname,
-            'name': fieldname,
-            'sortable': True,
-        })
-
     try:
+        for idx, fieldname in enumerate(reader.fieldnames or []):
+            column_count = sum(1 for column in columns if fieldname == column['name'])
+            if column_count:
+                unique_fieldname = '{}-{}'.format(fieldname, column_count + 1)
+                reader.fieldnames[idx] = unique_fieldname
+            else:
+                unique_fieldname = fieldname
+            columns.append({
+                'id': unique_fieldname,
+                'field': unique_fieldname,
+                'name': fieldname,
+                'sortable': True,
+            })
+
         rows = [row for row in reader]
     except csv.Error as e:
         if any("field larger than field limit" in errorMsg for errorMsg in e.args):
             raise TabularRendererError(
                 'This file contains a field too large to render. '
                 'Please download and view it locally.',
-                code=400,
-                extension='csv',
+                code=HTTPStatus.BAD_REQUEST,
+                extension=ext,
             ) from e
         else:
-            raise TabularRendererError('csv.Error: {}'.format(e), extension='csv') from e
+            raise TabularRendererError(
+                'Cannot render file as {}. The file may be empty or corrupt'.format(ext),
+                code=HTTPStatus.BAD_REQUEST,
+                extension=ext
+            ) from e
+
+    # Outside the other except because the `if any` line causes more errors to be raised
+    # on certain exceptions
+    except Exception as e:
+        raise TabularRendererError(
+            'Cannot render file as {}. The file may be empty or corrupt'.format(ext),
+            code=HTTPStatus.BAD_REQUEST,
+            extension=ext
+        ) from e
 
     if not columns and not rows:
-        raise EmptyTableError('Table empty or corrupt.', extension='csv')
+        raise EmptyTableError(
+            'Cannot render file as {}. The file may be empty or corrupt'.format(ext),
+            code=HTTPStatus.BAD_REQUEST,
+            extension=ext)
 
     return {'Sheet 1': (columns, rows)}
 
@@ -67,26 +97,3 @@ def sav_stdlib(fp):
     with open(csv_file.name, 'r') as file:
         csv_file.close()
         return csv_stdlib(file)
-
-
-def _set_dialect_quote_attrs(dialect, data):
-    """Set quote-related dialect attributes based on up to 2kb of csv data.
-
-    The regular expressions search for things that look like the beginning of
-    a list, wrapped in a quotation mark that is not dialect.quotechar, with
-    list items wrapped in dialect.quotechar and seperated by commas.
-
-    Example matches include:
-        "['1', '2', '3' for quotechar == '
-        '{"a", "b", "c" for quotechar == "
-    """
-    if dialect.quotechar == '"':
-        if re.search('\'[[({]".+",', data):
-            dialect.quotechar = "'"
-        if re.search("'''[[({]\".+\",", data):
-            dialect.doublequote = True
-    elif dialect.quotechar == "'":
-        if re.search("\"[[({]'.+',", data):
-            dialect.quotechar = '"'
-        if re.search('"""[[({]\'.+\',', data):
-            dialect.doublequote = True
diff --git a/mfr/extensions/tabular/settings.py b/mfr/extensions/tabular/settings.py
index 87d46e885..8895e3cff 100644
--- a/mfr/extensions/tabular/settings.py
+++ b/mfr/extensions/tabular/settings.py
@@ -10,7 +10,7 @@
 
 LIBS = config.get('LIBS', {
     '.csv': [libs.csv_stdlib],
-    '.tsv': [libs.csv_stdlib],
+    '.tsv': [libs.tsv_stdlib],
     '.gsheet': [libs.xlsx_xlrd],
     '.xlsx': [libs.xlsx_xlrd],
     '.xls': [libs.xlsx_xlrd],
diff --git a/tests/extensions/ipynb/files/no_metadata.ipynb b/tests/extensions/ipynb/files/no_metadata.ipynb
index 8d5457eb0..948a6718b 100644
--- a/tests/extensions/ipynb/files/no_metadata.ipynb
+++ b/tests/extensions/ipynb/files/no_metadata.ipynb
@@ -528,8 +528,7 @@
     ]
    }
  ],
- "cells": [],
  "metadata": {},
  "nbformat": 3,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/tests/extensions/tabular/files/invalid_null.csv b/tests/extensions/tabular/files/invalid_null.csv
new file mode 100644
index 000000000..c25eed4af
Binary files /dev/null and b/tests/extensions/tabular/files/invalid_null.csv differ
diff --git a/tests/extensions/tabular/test_stdlib_tools.py b/tests/extensions/tabular/test_stdlib_tools.py
new file mode 100644
index 000000000..3d958eb01
--- /dev/null
+++ b/tests/extensions/tabular/test_stdlib_tools.py
@@ -0,0 +1,58 @@
+import os
+from http import HTTPStatus
+from collections import OrderedDict
+
+import pytest
+
+from mfr.extensions.tabular.libs import stdlib_tools
+from mfr.extensions.tabular.exceptions import (EmptyTableError,
+                                               TabularRendererError)
+
+BASE = os.path.dirname(os.path.abspath(__file__))
+
+
+class TestTabularStdlibTools:
+
+    def test_csv_stdlib(self):
+        with open(os.path.join(BASE, 'files', 'test.csv')) as fp:
+            sheets = stdlib_tools.csv_stdlib(fp)
+
+        sheet = sheets.popitem()[1]
+        assert sheet[0] == [
+            {'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
+            {'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
+            {'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
+        ]
+        assert sheet[1][0] == OrderedDict([('one', 'à'), ('two', 'b'), ('three', 'c')])
+        assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])
+
+    def test_tsv_stdlib(self):
+        with open(os.path.join(BASE, 'files', 'test.tsv')) as fp:
+            sheets = stdlib_tools.tsv_stdlib(fp)
+
+        sheet = sheets.popitem()[1]
+        assert sheet[0] == [
+            {'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
+            {'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
+            {'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
+        ]
+        assert sheet[1][0] == OrderedDict([('one', 'a'), ('two', 'b'), ('three', 'c')])
+        assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])
+
+    def test_tsv_stdlib_exception_raises(self):
+        with open(os.path.join(BASE, 'files', 'invalid.tsv')) as fp:
+            with pytest.raises(EmptyTableError) as e:
+                stdlib_tools.tsv_stdlib(fp)
+            assert e.value.code == HTTPStatus.BAD_REQUEST
+
+    def test_csv_stdlib_exception_raises(self):
+        with open(os.path.join(BASE, 'files', 'invalid.csv')) as fp:
+            with pytest.raises(EmptyTableError) as e:
+                stdlib_tools.csv_stdlib(fp)
+            assert e.value.code == HTTPStatus.BAD_REQUEST
+
+    def test_csv_stdlib_other_exception_raises(self):
+        with open(os.path.join(BASE, 'files', 'invalid_null.csv')) as fp:
+            with pytest.raises(TabularRendererError) as e:
+                stdlib_tools.csv_stdlib(fp)
+            assert e.value.code == HTTPStatus.BAD_REQUEST
diff --git a/tests/extensions/zip/test_renderer.py b/tests/extensions/zip/test_renderer.py
index 162e27349..fb00942d2 100644
--- a/tests/extensions/zip/test_renderer.py
+++ b/tests/extensions/zip/test_renderer.py
@@ -65,7 +65,7 @@ class TestZipRenderer:
 
     def test_render(self, renderer):
         body = renderer.render()
-        parsed_html = BeautifulSoup(body)
+        parsed_html = BeautifulSoup(body, "html.parser")
         rows = parsed_html.findChildren('table')[0].findChildren(['tr'])
 
         name = rows[2].findChildren('td')[0].get_text().strip()
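
Note (not part of the diff): a minimal sketch of how the new sniff-and-fallback path in stdlib_tools.py is exercised. The in-memory sample below is hypothetical; tsv_stdlib only needs a file-like object with read() and seek().

import csv
import io

from mfr.extensions.tabular.libs import stdlib_tools

# Hypothetical in-memory TSV sample standing in for a real fixture file.
sample = io.StringIO('one\ttwo\tthree\na\tb\tc\n1\t2\t3\n')

# tsv_stdlib() sniffs the dialect from the whole stream, seeks back to the
# start, and hands a csv.DictReader to parse_stdlib(), which returns
# {'Sheet 1': (columns, rows)}.
sheets = stdlib_tools.tsv_stdlib(sample)
columns, rows = sheets['Sheet 1']
print([col['name'] for col in columns])  # ['one', 'two', 'three']
print(rows[0]['one'], rows[1]['three'])  # a 3

# The fallback mirrors the new code: csv.Sniffer().sniff() raises csv.Error
# when it cannot determine a dialect, in which case csv.excel_tab is used.
try:
    dialect = csv.Sniffer().sniff(sample.getvalue(), '\t')
except csv.Error:
    dialect = csv.excel_tab
print(dialect.delimiter == '\t')  # True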