Skip to content

Commit df5485e

Browse files
authored
Bugfix/attempt to autodetect file encodings when reading files (#265)
* Auto-detect file encoding on read_csv_arg
* add test
* use on flat file reader
* test ascii
* log when no encoding is detected
* init logger before logging
* move return out of finally to appease flake8
* changelog
* make FileOrString auto decoded, add test for FileOrString and file not exist case
1 parent 41467c0 commit df5485e

File tree

4 files changed

+67
-3
lines changed

4 files changed

+67
-3
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
88
The intended audience of this file is for py42 consumers -- as such, changes that don't affect
99
how a consumer would use the library (e.g. adding unit tests, updating documentation, etc) are not captured here.
1010

11+
## Unreleased
12+
13+
### Fixed
14+
15+
- Arguments/options that read data from files now attempt to autodetect file encodings, resolving a bug where CSVs written
16+
on Windows with PowerShell would fail to be read properly.
17+
1118
## 1.4.0 - 2021-03-09
1219

1320
### Added

src/code42cli/click_ext/types.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,29 @@
33
from datetime import timedelta
44
from datetime import timezone
55

6+
import chardet
67
import click
78
from click.exceptions import BadParameter
89

10+
from code42cli.logger import CliLogger
911

10-
class FileOrString(click.File):
12+
13+
class AutoDecodedFile(click.File):
    """Attempts to autodetect file's encoding prior to normal click.File processing.

    The file's raw bytes are handed to ``chardet`` and the detected encoding is stored
    on ``self.encoding`` before the call is delegated to ``click.File.convert``.
    """

    def convert(self, value, param, ctx):
        """Detect the encoding of ``value``, then defer to ``click.File.convert``.

        Detection is best-effort. If ``value`` cannot be opened and read here,
        ``self.encoding`` is left as it was and the parent class is left to report
        the problem. If ``chardet`` cannot determine an encoding, ``self.encoding``
        is set to ``None`` and an error is logged.

        :param value: the raw parameter value (normally a file path).
        :param param: the click parameter being converted; passed through unchanged.
        :param ctx: the click context; passed through unchanged.
        :return: whatever ``click.File.convert`` returns for ``value``.
        """
        try:
            with open(value, "rb") as file:
                raw_bytes = file.read()
        except (OSError, TypeError, ValueError):
            # Unreadable path (OSError) or a value that is not a usable path at all
            # (TypeError/ValueError): skip detection and let click.File do its own
            # exception handling for the filepath.
            return super().convert(value, param, ctx)

        # Only the file access above is guarded, so that an unexpected failure in
        # detection or logging is not silently swallowed.
        self.encoding = chardet.detect(raw_bytes)["encoding"]
        if self.encoding is None:
            CliLogger().log_error(f"Failed to detect encoding of file: {value}")

        return super().convert(value, param, ctx)
26+
27+
28+
class FileOrString(AutoDecodedFile):
1129
"""Declares a parameter to be a file (if the argument begins with `@`), otherwise accepts it as
1230
a string.
1331
"""

src/code42cli/file_readers.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import click
44

5+
from code42cli.click_ext.types import AutoDecodedFile
56
from code42cli.errors import Code42CLIError
67

78

@@ -13,7 +14,7 @@ def read_csv_arg(headers):
1314
return click.argument(
1415
"csv_rows",
1516
metavar="CSV_FILE",
16-
type=click.File("r"),
17+
type=AutoDecodedFile("r"),
1718
callback=lambda ctx, param, arg: read_csv(arg, headers=headers),
1819
)
1920

@@ -65,7 +66,7 @@ def read_flat_file(file):
6566

6667
read_flat_file_arg = click.argument(
6768
"file_rows",
68-
type=click.File("r"),
69+
type=AutoDecodedFile("r"),
6970
metavar="FILE",
7071
callback=lambda ctx, param, arg: read_flat_file(arg),
7172
)

tests/test_file_readers.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
import click.exceptions
12
import pytest
23

4+
from code42cli.click_ext.types import AutoDecodedFile
5+
from code42cli.click_ext.types import FileOrString
36
from code42cli.errors import Code42CLIError
47
from code42cli.file_readers import read_csv
58

@@ -62,3 +65,38 @@ def test_read_csv_when_some_but_not_all_required_headers_present_raises(runner):
6265
with open("test_csv.csv") as csv:
6366
with pytest.raises(Code42CLIError):
6467
read_csv(file=csv, headers=HEADERS + ["extra_header"])
68+
69+
70+
@pytest.mark.parametrize(
    "encoding", ["utf8", "utf16", "latin_1"],
)
def test_read_csv_reads_various_encodings_automatically(runner, encoding):
    """read_csv parses a CSV written in each encoding when it is opened via AutoDecodedFile."""
    with runner.isolated_filesystem():
        with open("test.csv", "w", encoding=encoding) as file:
            file.write("".join(HEADERED_CSV))

        # convert() is called with ctx=None, so there is no click context to close the
        # returned handle; close it explicitly instead of leaking it.
        with AutoDecodedFile("r").convert("test.csv", None, None) as csv:
            result_list = read_csv(csv, headers=HEADERS)

    assert result_list == [
        {"header1": "col1_val1", "header2": "col2_val1", "header3": "col3_val1"},
        {"header1": "col1_val2", "header2": "col2_val2", "header3": "col3_val2"},
    ]
85+
86+
87+
def test_AutoDecodedFile_raises_expected_exception_when_file_not_exists(runner):
    """Converting a path that cannot be opened raises click's BadParameter."""
    file_type = AutoDecodedFile("r")
    missing_path = "not_a_file"
    with pytest.raises(click.exceptions.BadParameter):
        file_type.convert(missing_path, None, None)
90+
91+
92+
@pytest.mark.parametrize("encoding", ["utf8", "utf16", "latin_1"])
def test_FileOrString_arg_handles_various_encodings_automatically(runner, encoding):
    """An ``@file`` argument yields the file's text for each tested encoding."""
    expected = '{"tést": "dåta"}'
    with runner.isolated_filesystem():
        # Write the non-ASCII payload in the encoding under test.
        with open("test1.json", "w", encoding=encoding) as json_file:
            json_file.write(expected)

        file_or_string = FileOrString()
        actual = file_or_string.convert("@test1.json", None, None)
        assert actual == expected

0 commit comments

Comments
 (0)