Skip to content

Commit 343bb78

Browse files
committed
Add type annotations
1 parent f1bf066 commit 343bb78

File tree

12 files changed

+111
-66
lines changed

12 files changed

+111
-66
lines changed

.travis.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ before_install:
1111
- sudo apt-get install libicu-dev -y
1212

1313
install:
14-
- pip install --upgrade -q PyICU cchardet six twine nose coveralls
15-
- pip install -e .
14+
- pip install --upgrade -q PyICU cchardet coveralls
15+
- pip install -e '.[dev]'
1616

1717
script:
1818
- nosetests --with-coverage --cover-package=normality

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
include LICENSE
22
include README.md
3+
include normality/py.typed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
all: clean test dists
33

44
install:
5-
pip install -q --upgrade twine pytest
5+
pip install -q '.[dev]'
66

77
test: install
88
pytest

normality/__init__.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,27 @@
1+
"""Helper functions for string cleaning.
2+
3+
`normality` includes functions to convert arbitrary Python objects to
4+
strings, transliterate them into the latin alphabet, make slugs for
5+
URLs, or perform the substitution of characters based on unicode
6+
character categories.
7+
"""
8+
from typing import Any, Optional
9+
110
from normality.cleaning import collapse_spaces, category_replace
211
from normality.constants import UNICODE_CATEGORIES, WS
312
from normality.transliteration import latinize_text, ascii_text
413
from normality.encoding import guess_encoding, guess_file_encoding # noqa
514
from normality.encoding import DEFAULT_ENCODING
615
from normality.stringify import stringify # noqa
716
from normality.paths import safe_filename # noqa
17+
from normality.util import Categories, Encoding
818

919

10-
def normalize(text, lowercase=True, collapse=True, latinize=False, ascii=False,
11-
encoding_default=DEFAULT_ENCODING, encoding=None,
12-
replace_categories=UNICODE_CATEGORIES):
20+
def normalize(text: Any, lowercase: bool = True, collapse: bool = True,
21+
latinize: bool = False, ascii: bool = False,
22+
encoding_default: Encoding = DEFAULT_ENCODING,
23+
encoding: Optional[str] = None,
24+
replace_categories: Categories = UNICODE_CATEGORIES):
1325
"""The main normalization function for text.
1426
1527
This will take a string and apply a set of transformations to it so
@@ -58,7 +70,7 @@ def normalize(text, lowercase=True, collapse=True, latinize=False, ascii=False,
5870
return text
5971

6072

61-
def slugify(text, sep='-'):
73+
def slugify(text: Any, sep: str = '-') -> Optional[str]:
6274
"""A simple slug generator."""
6375
text = stringify(text)
6476
if text is None:

normality/cleaning.py

Lines changed: 36 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,84 +1,91 @@
11
import re
2-
import six
32
import unicodedata
3+
from typing import Any, Optional
44

55
from normality.constants import UNICODE_CATEGORIES, CONTROL_CODES, WS
6+
from normality.util import Categories, is_text
67

78
COLLAPSE_RE = re.compile(r'\s+', re.U)
89
BOM_RE = re.compile('^\ufeff', re.U)
910
UNSAFE_RE = re.compile(r'^\ufeff|[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f\x80-\x9f]')
1011
QUOTES_RE = re.compile(r'^["\'](.*)["\']$')
1112

1213

13-
def decompose_nfkd(text):
14+
def decompose_nfkd(text: Any) -> Optional[str]:
1415
"""Perform unicode compatibility decomposition.
1516
1617
This will replace some non-standard value representations in unicode and
1718
normalise them, while also separating characters and their diacritics into
1819
two separate codepoints.
1920
"""
20-
if is_text(text):
21-
return unicodedata.normalize('NFKD', text)
21+
if not is_text(text):
22+
return None
23+
return unicodedata.normalize('NFKD', text)
2224

2325

24-
def compose_nfc(text):
26+
def compose_nfc(text: Any) -> Optional[str]:
2527
"""Perform unicode composition."""
26-
if is_text(text):
27-
return unicodedata.normalize('NFC', text)
28+
if not is_text(text):
29+
return None
30+
return unicodedata.normalize('NFC', text)
2831

2932

30-
def compose_nfkc(text):
33+
def compose_nfkc(text: Any) -> Optional[str]:
3134
"""Perform unicode composition."""
32-
if is_text(text):
33-
return unicodedata.normalize('NFKC', text)
35+
if not is_text(text):
36+
return None
37+
return unicodedata.normalize('NFKC', text)
3438

3539

36-
def strip_quotes(text):
40+
def strip_quotes(text: Any) -> Optional[str]:
3741
"""Remove double or single quotes surrounding a string."""
38-
if is_text(text):
39-
return QUOTES_RE.sub('\\1', text)
42+
if not is_text(text):
43+
return None
44+
return QUOTES_RE.sub('\\1', text)
4045

4146

42-
def category_replace(text, replacements=UNICODE_CATEGORIES):
47+
def category_replace(text: Any,
48+
replacements: Categories = UNICODE_CATEGORIES
49+
) -> Optional[str]:
4350
"""Remove characters from a string based on unicode classes.
4451
4552
This is a method for removing non-text characters (such as punctuation,
4653
whitespace, marks and diacritics) from a piece of text by class, rather
4754
than specifying them individually.
4855
"""
56+
text = decompose_nfkd(text)
4957
if not is_text(text):
5058
return None
5159
characters = []
52-
for character in decompose_nfkd(text):
60+
for character in text:
5361
cat = unicodedata.category(character)
5462
replacement = replacements.get(cat, character)
5563
if replacement is not None:
5664
characters.append(replacement)
5765
return u''.join(characters)
5866

5967

60-
def remove_control_chars(text):
68+
def remove_control_chars(text: Any) -> Optional[str]:
6169
"""Remove just the control codes from a piece of text."""
6270
return category_replace(text, replacements=CONTROL_CODES)
6371

6472

65-
def remove_unsafe_chars(text):
73+
def remove_unsafe_chars(text) -> Optional[str]:
6674
"""Remove unsafe unicode characters from a piece of text."""
67-
if is_text(text):
68-
return UNSAFE_RE.sub('', text)
75+
if not is_text(text):
76+
return None
77+
return UNSAFE_RE.sub('', text)
6978

7079

71-
def remove_byte_order_mark(text):
80+
def remove_byte_order_mark(text) -> Optional[str]:
7281
"""Remove a BOM from the beginning of the text."""
73-
if is_text(text):
74-
return BOM_RE.sub('', text)
82+
if not is_text(text):
83+
return None
84+
return BOM_RE.sub('', text)
7585

7686

77-
def collapse_spaces(text):
87+
def collapse_spaces(text: Any) -> Optional[str]:
7888
"""Remove newlines, tabs and multiple spaces with single spaces."""
79-
if is_text(text):
80-
return COLLAPSE_RE.sub(WS, text).strip(WS)
81-
82-
83-
def is_text(data):
84-
return isinstance(data, six.text_type)
89+
if not is_text(text):
90+
return None
91+
return COLLAPSE_RE.sub(WS, text).strip(WS)

normality/constants.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from normality.util import Categories
12
# https://en.wikipedia.org/wiki/Cyrillic_script_in_Unicode
23
# Cyrillic: U+0400–U+04FF, 256 characters
34
# Cyrillic Supplement: U+0500–U+052F, 48 characters
@@ -8,13 +9,13 @@
89
# Combining Half Marks: U+FE2E–U+FE2F, 2 Cyrillic characters
910

1011

11-
WS = ' '
12+
WS: str = ' '
1213

1314
# Unicode character classes, see:
1415
# http://www.fileformat.info/info/unicode/category/index.htm
1516
# https://en.wikipedia.org/wiki/Unicode_character_property
1617
# http://www.unicode.org/charts/beta/script/
17-
UNICODE_CATEGORIES = {
18+
UNICODE_CATEGORIES: Categories = {
1819
'Cc': None,
1920
'Cf': None,
2021
'Cs': None,
@@ -39,7 +40,7 @@
3940
'So': None
4041
}
4142

42-
CONTROL_CODES = {
43+
CONTROL_CODES: Categories = {
4344
'Cc': WS,
4445
'Cf': WS,
4546
'Cs': WS,

normality/encoding.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import io
22
import codecs
33
import chardet # type: ignore
4+
from typing import cast, BinaryIO
5+
from normality.util import Encoding
46

57
DEFAULT_ENCODING = 'utf-8'
68

79

8-
def _is_encoding_codec(encoding):
10+
def _is_encoding_codec(encoding: Encoding) -> bool:
911
"""Check if a given string is a valid encoding name."""
1012
try:
1113
codecs.lookup(encoding)
@@ -14,7 +16,8 @@ def _is_encoding_codec(encoding):
1416
return False
1517

1618

17-
def normalize_encoding(encoding, default=DEFAULT_ENCODING):
19+
def normalize_encoding(encoding: str, default: Encoding = DEFAULT_ENCODING
20+
) -> str:
1821
"""Normalize the encoding name, replace ASCII w/ UTF-8."""
1922
if encoding is None:
2023
return default
@@ -29,7 +32,8 @@ def normalize_encoding(encoding, default=DEFAULT_ENCODING):
2932
return default
3033

3134

32-
def normalize_result(result, default, threshold=0.2):
35+
def normalize_result(result, default: Encoding,
36+
threshold: float = 0.2) -> Encoding:
3337
"""Interpret a chardet result."""
3438
if result is None:
3539
return default
@@ -41,7 +45,8 @@ def normalize_result(result, default, threshold=0.2):
4145
default=default)
4246

4347

44-
def guess_encoding(text, default=DEFAULT_ENCODING):
48+
def guess_encoding(text: bytes, default: Encoding = DEFAULT_ENCODING
49+
) -> Encoding:
4550
"""Guess string encoding.
4651
4752
Given a piece of text, apply character encoding detection to
@@ -51,7 +56,8 @@ def guess_encoding(text, default=DEFAULT_ENCODING):
5156
return normalize_result(result, default=default)
5257

5358

54-
def guess_file_encoding(fh, default=DEFAULT_ENCODING):
59+
def guess_file_encoding(fh: BinaryIO, default: Encoding = DEFAULT_ENCODING
60+
) -> Encoding:
5561
"""Guess encoding from a file handle."""
5662
start = fh.tell()
5763
detector = chardet.UniversalDetector()
@@ -68,7 +74,9 @@ def guess_file_encoding(fh, default=DEFAULT_ENCODING):
6874
return normalize_result(detector.result, default=default)
6975

7076

71-
def guess_path_encoding(file_path, default=DEFAULT_ENCODING):
77+
def guess_path_encoding(file_path, default: Encoding = DEFAULT_ENCODING
78+
) -> Encoding:
7279
"""Wrapper to open that damn file for you, lazy bastard."""
7380
with io.open(file_path, 'rb') as fh:
74-
return guess_file_encoding(fh, default=default)
81+
fhb = cast(BinaryIO, fh)
82+
return guess_file_encoding(fhb, default=default)

normality/py.typed

Whitespace-only changes.

normality/stringify.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
1-
import six
21
from datetime import datetime, date
32
from decimal import Decimal
3+
from typing import Any, Optional
44

55
from normality.cleaning import remove_unsafe_chars
66
from normality.encoding import guess_encoding
77
from normality.encoding import DEFAULT_ENCODING
88

99

10-
def stringify(value, encoding_default=DEFAULT_ENCODING, encoding=None):
10+
def stringify(value: Any, encoding_default: str = DEFAULT_ENCODING,
11+
encoding: Optional[str] = None) -> Optional[str]:
1112
"""Brute-force convert a given object to a string.
1213
1314
This will attempt an increasingly mean set of conversions to make a given
@@ -17,18 +18,18 @@ def stringify(value, encoding_default=DEFAULT_ENCODING, encoding=None):
1718
if value is None:
1819
return None
1920

20-
if not isinstance(value, six.text_type):
21+
if not isinstance(value, str):
2122
if isinstance(value, (date, datetime)):
2223
return value.isoformat()
2324
elif isinstance(value, (float, Decimal)):
2425
return Decimal(value).to_eng_string()
25-
elif isinstance(value, six.binary_type):
26+
elif isinstance(value, bytes):
2627
if encoding is None:
2728
encoding = guess_encoding(value, default=encoding_default)
2829
value = value.decode(encoding, 'replace')
2930
value = remove_unsafe_chars(value)
3031
else:
31-
value = six.text_type(value)
32+
value = str(value)
3233

3334
# XXX: is this really a good idea?
3435
value = value.strip()

normality/transliteration.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
# coding: utf-8
21
"""
32
Transliterate the given text to the latin script.
43
@@ -12,6 +11,8 @@
1211
if it is installed.
1312
"""
1413
import warnings
14+
from typing import Optional
15+
1516
from normality.cleaning import compose_nfkc, is_text
1617

1718
# Transform to latin, separate accents, decompose, remove
@@ -23,7 +24,7 @@ class ICUWarning(UnicodeWarning):
2324
pass
2425

2526

26-
def latinize_text(text, ascii=False):
27+
def latinize_text(text: Optional[str], ascii=False) -> Optional[str]:
2728
"""Transliterate the given text to the latin script.
2829
2930
This attempts to convert a given text to latin script using the
@@ -34,22 +35,23 @@ def latinize_text(text, ascii=False):
3435

3536
if ascii:
3637
if not hasattr(latinize_text, '_ascii'):
37-
latinize_text._ascii = make_transliterator(ASCII_SCRIPT)
38-
return latinize_text._ascii(text)
38+
latinize_text._ascii = make_trans(ASCII_SCRIPT) # type: ignore
39+
return latinize_text._ascii(text) # type: ignore
3940

4041
if not hasattr(latinize_text, '_tr'):
41-
latinize_text._tr = make_transliterator('Any-Latin')
42-
return latinize_text._tr(text)
42+
latinize_text._tr = make_trans('Any-Latin') # type: ignore
43+
return latinize_text._tr(text) # type: ignore
4344

4445

45-
def ascii_text(text):
46+
def ascii_text(text: Optional[str]) -> Optional[str]:
4647
"""Transliterate the given text and make sure it ends up as ASCII."""
4748
text = latinize_text(text, ascii=True)
48-
if is_text(text):
49-
return text.encode('ascii', 'ignore').decode('ascii')
49+
if text is None or not is_text(text):
50+
return None
51+
return text.encode('ascii', 'ignore').decode('ascii')
5052

5153

52-
def make_transliterator(script):
54+
def make_trans(script):
5355
try:
5456
from icu import Transliterator # type: ignore
5557
inst = Transliterator.createInstance(script)

normality/util.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Given the whole thing is a utility package, this is really meta.
2+
from typing import Any, Dict, Optional
3+
4+
Categories = Dict[str, Optional[str]]
5+
Encoding = str
6+
7+
8+
def is_text(data: Any) -> bool:
9+
return isinstance(data, str)

0 commit comments

Comments (0)