Skip to content

Commit 343bb78

Browse files
committed
Add type annotations
1 parent f1bf066 commit 343bb78

File tree

12 files changed

+111
-66
lines changed

12 files changed

+111
-66
lines changed

.travis.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ before_install:
1111
- sudo apt-get install libicu-dev -y
1212

1313
install:
14-
- pip install --upgrade -q PyICU cchardet six twine nose coveralls
15-
- pip install -e .
14+
- pip install --upgrade -q PyICU cchardet coveralls
15+
- pip install -e '.[dev]'
1616

1717
script:
1818
- nosetests --with-coverage --cover-package=normality

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
include LICENSE
22
include README.md
3+
include normality/py.typed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
all: clean test dists
33

44
install:
5-
pip install -q --upgrade twine pytest
5+
pip install -q '.[dev]'
66

77
test: install
88
pytest

normality/__init__.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,27 @@
1+
"""Helper functions for string cleaning.
2+
3+
`normality` includes functions to convert arbitrary Python objects to
4+
strings, transliterate them into the latin alphabet, make slugs for
5+
URLs, or perform the substitution of characters based on unicode
6+
character categories.
7+
"""
8+
from typing import Any, Optional
9+
110
from normality.cleaning import collapse_spaces, category_replace
211
from normality.constants import UNICODE_CATEGORIES, WS
312
from normality.transliteration import latinize_text, ascii_text
413
from normality.encoding import guess_encoding, guess_file_encoding # noqa
514
from normality.encoding import DEFAULT_ENCODING
615
from normality.stringify import stringify # noqa
716
from normality.paths import safe_filename # noqa
17+
from normality.util import Categories, Encoding
818

919

10-
def normalize(text, lowercase=True, collapse=True, latinize=False, ascii=False,
11-
encoding_default=DEFAULT_ENCODING, encoding=None,
12-
replace_categories=UNICODE_CATEGORIES):
20+
def normalize(text: Any, lowercase: bool = True, collapse: bool = True,
21+
latinize: bool = False, ascii: bool = False,
22+
encoding_default: Encoding = DEFAULT_ENCODING,
23+
encoding: Optional[str] = None,
24+
replace_categories: Categories = UNICODE_CATEGORIES):
1325
"""The main normalization function for text.
1426
1527
This will take a string and apply a set of transformations to it so
@@ -58,7 +70,7 @@ def normalize(text, lowercase=True, collapse=True, latinize=False, ascii=False,
5870
return text
5971

6072

61-
def slugify(text, sep='-'):
73+
def slugify(text: Any, sep: str = '-') -> Optional[str]:
6274
"""A simple slug generator."""
6375
text = stringify(text)
6476
if text is None:

normality/cleaning.py

Lines changed: 36 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,84 +1,91 @@
11
import re
2-
import six
32
import unicodedata
3+
from typing import Any, Optional
44

55
from normality.constants import UNICODE_CATEGORIES, CONTROL_CODES, WS
6+
from normality.util import Categories, is_text
67

78
COLLAPSE_RE = re.compile(r'\s+', re.U)
89
BOM_RE = re.compile('^\ufeff', re.U)
910
UNSAFE_RE = re.compile(r'^\ufeff|[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f\x80-\x9f]')
1011
QUOTES_RE = re.compile(r'^["\'](.*)["\']$')
1112

1213

13-
def decompose_nfkd(text):
14+
def decompose_nfkd(text: Any) -> Optional[str]:
1415
"""Perform unicode compatibility decomposition.
1516
1617
This will replace some non-standard value representations in unicode and
1718
normalise them, while also separating characters and their diacritics into
1819
two separate codepoints.
1920
"""
20-
if is_text(text):
21-
return unicodedata.normalize('NFKD', text)
21+
if not is_text(text):
22+
return None
23+
return unicodedata.normalize('NFKD', text)
2224

2325

24-
def compose_nfc(text):
26+
def compose_nfc(text: Any) -> Optional[str]:
2527
"""Perform unicode composition."""
26-
if is_text(text):
27-
return unicodedata.normalize('NFC', text)
28+
if not is_text(text):
29+
return None
30+
return unicodedata.normalize('NFC', text)
2831

2932

30-
def compose_nfkc(text):
33+
def compose_nfkc(text: Any) -> Optional[str]:
3134
"""Perform unicode composition."""
32-
if is_text(text):
33-
return unicodedata.normalize('NFKC', text)
35+
if not is_text(text):
36+
return None
37+
return unicodedata.normalize('NFKC', text)
3438

3539

36-
def strip_quotes(text):
40+
def strip_quotes(text: Any) -> Optional[str]:
3741
"""Remove double or single quotes surrounding a string."""
38-
if is_text(text):
39-
return QUOTES_RE.sub('\\1', text)
42+
if not is_text(text):
43+
return None
44+
return QUOTES_RE.sub('\\1', text)
4045

4146

42-
def category_replace(text, replacements=UNICODE_CATEGORIES):
47+
def category_replace(text: Any,
48+
replacements: Categories = UNICODE_CATEGORIES
49+
) -> Optional[str]:
4350
"""Remove characters from a string based on unicode classes.
4451
4552
This is a method for removing non-text characters (such as punctuation,
4653
whitespace, marks and diacritics) from a piece of text by class, rather
4754
than specifying them individually.
4855
"""
56+
text = decompose_nfkd(text)
4957
if not is_text(text):
5058
return None
5159
characters = []
52-
for character in decompose_nfkd(text):
60+
for character in text:
5361
cat = unicodedata.category(character)
5462
replacement = replacements.get(cat, character)
5563
if replacement is not None:
5664
characters.append(replacement)
5765
return u''.join(characters)
5866

5967

60-
def remove_control_chars(text):
68+
def remove_control_chars(text: Any) -> Optional[str]:
6169
"""Remove just the control codes from a piece of text."""
6270
return category_replace(text, replacements=CONTROL_CODES)
6371

6472

65-
def remove_unsafe_chars(text):
73+
def remove_unsafe_chars(text) -> Optional[str]:
6674
"""Remove unsafe unicode characters from a piece of text."""
67-
if is_text(text):
68-
return UNSAFE_RE.sub('', text)
75+
if not is_text(text):
76+
return None
77+
return UNSAFE_RE.sub('', text)
6978

7079

71-
def remove_byte_order_mark(text):
80+
def remove_byte_order_mark(text) -> Optional[str]:
7281
"""Remove a BOM from the beginning of the text."""
73-
if is_text(text):
74-
return BOM_RE.sub('', text)
82+
if not is_text(text):
83+
return None
84+
return BOM_RE.sub('', text)
7585

7686

77-
def collapse_spaces(text):
87+
def collapse_spaces(text: Any) -> Optional[str]:
7888
"""Remove newlines, tabs and multiple spaces with single spaces."""
79-
if is_text(text):
80-
return COLLAPSE_RE.sub(WS, text).strip(WS)
81-
82-
83-
def is_text(data):
84-
return isinstance(data, six.text_type)
89+
if not is_text(text):
90+
return None
91+
return COLLAPSE_RE.sub(WS, text).strip(WS)

normality/constants.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from normality.util import Categories
12
# https://en.wikipedia.org/wiki/Cyrillic_script_in_Unicode
23
# Cyrillic: U+0400–U+04FF, 256 characters
34
# Cyrillic Supplement: U+0500–U+052F, 48 characters
@@ -8,13 +9,13 @@
89
# Combining Half Marks: U+FE2E–U+FE2F, 2 Cyrillic characters
910

1011

11-
WS = ' '
12+
WS: str = ' '
1213

1314
# Unicode character classes, see:
1415
# http://www.fileformat.info/info/unicode/category/index.htm
1516
# https://en.wikipedia.org/wiki/Unicode_character_property
1617
# http://www.unicode.org/charts/beta/script/
17-
UNICODE_CATEGORIES = {
18+
UNICODE_CATEGORIES: Categories = {
1819
'Cc': None,
1920
'Cf': None,
2021
'Cs': None,
@@ -39,7 +40,7 @@
3940
'So': None
4041
}
4142

42-
CONTROL_CODES = {
43+
CONTROL_CODES: Categories = {
4344
'Cc': WS,
4445
'Cf': WS,
4546
'Cs': WS,

normality/encoding.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import io
22
import codecs
33
import chardet # type: ignore
4+
from typing import cast, BinaryIO
5+
from normality.util import Encoding
46

57
DEFAULT_ENCODING = 'utf-8'
68

79

8-
def _is_encoding_codec(encoding):
10+
def _is_encoding_codec(encoding: Encoding) -> bool:
911
"""Check if a given string is a valid encoding name."""
1012
try:
1113
codecs.lookup(encoding)
@@ -14,7 +16,8 @@ def _is_encoding_codec(encoding):
1416
return False
1517

1618

17-
def normalize_encoding(encoding, default=DEFAULT_ENCODING):
19+
def normalize_encoding(encoding: str, default: Encoding = DEFAULT_ENCODING
20+
) -> str:
1821
"""Normalize the encoding name, replace ASCII w/ UTF-8."""
1922
if encoding is None:
2023
return default
@@ -29,7 +32,8 @@ def normalize_encoding(encoding, default=DEFAULT_ENCODING):
2932
return default
3033

3134

32-
def normalize_result(result, default, threshold=0.2):
35+
def normalize_result(result, default: Encoding,
36+
threshold: float = 0.2) -> Encoding:
3337
"""Interpret a chardet result."""
3438
if result is None:
3539
return default
@@ -41,7 +45,8 @@ def normalize_result(result, default, threshold=0.2):
4145
default=default)
4246

4347

44-
def guess_encoding(text, default=DEFAULT_ENCODING):
48+
def guess_encoding(text: bytes, default: Encoding = DEFAULT_ENCODING
49+
) -> Encoding:
4550
"""Guess string encoding.
4651
4752
Given a piece of text, apply character encoding detection to
@@ -51,7 +56,8 @@ def guess_encoding(text, default=DEFAULT_ENCODING):
5156
return normalize_result(result, default=default)
5257

5358

54-
def guess_file_encoding(fh, default=DEFAULT_ENCODING):
59+
def guess_file_encoding(fh: BinaryIO, default: Encoding = DEFAULT_ENCODING
60+
) -> Encoding:
5561
"""Guess encoding from a file handle."""
5662
start = fh.tell()
5763
detector = chardet.UniversalDetector()
@@ -68,7 +74,9 @@ def guess_file_encoding(fh, default=DEFAULT_ENCODING):
6874
return normalize_result(detector.result, default=default)
6975

7076

71-
def guess_path_encoding(file_path, default=DEFAULT_ENCODING):
77+
def guess_path_encoding(file_path, default: Encoding = DEFAULT_ENCODING
78+
) -> Encoding:
7279
"""Wrapper to open that damn file for you, lazy bastard."""
7380
with io.open(file_path, 'rb') as fh:
74-
return guess_file_encoding(fh, default=default)
81+
fhb = cast(BinaryIO, fh)
82+
return guess_file_encoding(fhb, default=default)

normality/py.typed

Whitespace-only changes.

normality/stringify.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
1-
import six
21
from datetime import datetime, date
32
from decimal import Decimal
3+
from typing import Any, Optional
44

55
from normality.cleaning import remove_unsafe_chars
66
from normality.encoding import guess_encoding
77
from normality.encoding import DEFAULT_ENCODING
88

99

10-
def stringify(value, encoding_default=DEFAULT_ENCODING, encoding=None):
10+
def stringify(value: Any, encoding_default: str = DEFAULT_ENCODING,
11+
encoding: Optional[str] = None) -> Optional[str]:
1112
"""Brute-force convert a given object to a string.
1213
1314
This will attempt an increasingly mean set of conversions to make a given
@@ -17,18 +18,18 @@ def stringify(value, encoding_default=DEFAULT_ENCODING, encoding=None):
1718
if value is None:
1819
return None
1920

20-
if not isinstance(value, six.text_type):
21+
if not isinstance(value, str):
2122
if isinstance(value, (date, datetime)):
2223
return value.isoformat()
2324
elif isinstance(value, (float, Decimal)):
2425
return Decimal(value).to_eng_string()
25-
elif isinstance(value, six.binary_type):
26+
elif isinstance(value, bytes):
2627
if encoding is None:
2728
encoding = guess_encoding(value, default=encoding_default)
2829
value = value.decode(encoding, 'replace')
2930
value = remove_unsafe_chars(value)
3031
else:
31-
value = six.text_type(value)
32+
value = str(value)
3233

3334
# XXX: is this really a good idea?
3435
value = value.strip()

normality/transliteration.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
# coding: utf-8
21
"""
32
Transliterate the given text to the latin script.
43
@@ -12,6 +11,8 @@
1211
if it is installed.
1312
"""
1413
import warnings
14+
from typing import Optional
15+
1516
from normality.cleaning import compose_nfkc, is_text
1617

1718
# Transform to latin, separate accents, decompose, remove
@@ -23,7 +24,7 @@ class ICUWarning(UnicodeWarning):
2324
pass
2425

2526

26-
def latinize_text(text, ascii=False):
27+
def latinize_text(text: Optional[str], ascii=False) -> Optional[str]:
2728
"""Transliterate the given text to the latin script.
2829
2930
This attempts to convert a given text to latin script using the
@@ -34,22 +35,23 @@ def latinize_text(text, ascii=False):
3435

3536
if ascii:
3637
if not hasattr(latinize_text, '_ascii'):
37-
latinize_text._ascii = make_transliterator(ASCII_SCRIPT)
38-
return latinize_text._ascii(text)
38+
latinize_text._ascii = make_trans(ASCII_SCRIPT) # type: ignore
39+
return latinize_text._ascii(text) # type: ignore
3940

4041
if not hasattr(latinize_text, '_tr'):
41-
latinize_text._tr = make_transliterator('Any-Latin')
42-
return latinize_text._tr(text)
42+
latinize_text._tr = make_trans('Any-Latin') # type: ignore
43+
return latinize_text._tr(text) # type: ignore
4344

4445

45-
def ascii_text(text):
46+
def ascii_text(text: Optional[str]) -> Optional[str]:
4647
"""Transliterate the given text and make sure it ends up as ASCII."""
4748
text = latinize_text(text, ascii=True)
48-
if is_text(text):
49-
return text.encode('ascii', 'ignore').decode('ascii')
49+
if text is None or not is_text(text):
50+
return None
51+
return text.encode('ascii', 'ignore').decode('ascii')
5052

5153

52-
def make_transliterator(script):
54+
def make_trans(script):
5355
try:
5456
from icu import Transliterator # type: ignore
5557
inst = Transliterator.createInstance(script)

normality/util.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Given the whole thing is a utility package, this is really meta.
2+
from typing import Any, Dict, Optional
3+
4+
Categories = Dict[str, Optional[str]]
5+
Encoding = str
6+
7+
8+
def is_text(data: Any) -> bool:
9+
return isinstance(data, str)

0 commit comments

Comments (0)