Skip to content

Commit 71d9b6e

Browse files
authored
Merge pull request #227 from mailgun/maxim/develop
PIP-1615: Operate on unicode data exclusively [python3]
2 parents a8c7e6a + 14f106e commit 71d9b6e

File tree

6 files changed: 72 additions and 266 deletions.

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def finalize_options(self):
2929

3030

3131
setup(name='talon',
32-
version='1.5.0',
32+
version='1.6.0',
3333
description=("Mailgun library "
3434
"to extract message quotations and signatures."),
3535
long_description=open("README.rst").read(),

talon/quotations.py

+20-30
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,17 @@
66
"""
77

88
from __future__ import absolute_import
9-
import regex as re
9+
1010
import logging
1111
from copy import deepcopy
1212

13-
from lxml import html, etree
14-
15-
from talon.utils import (get_delimiter, html_tree_to_text,
16-
html_document_fromstring)
17-
from talon import html_quotations
13+
import regex as re
14+
from lxml import etree, html
1815
from six.moves import range
19-
import six
2016

17+
from talon import html_quotations
18+
from talon.utils import (get_delimiter, html_document_fromstring,
19+
html_tree_to_text)
2120

2221
log = logging.getLogger(__name__)
2322

@@ -94,7 +93,7 @@
9493
)
9594

9695
RE_QUOTATION = re.compile(
97-
r'''
96+
r"""
9897
(
9998
# quotation border: splitter line or a number of quotation marker lines
10099
(?:
@@ -112,10 +111,10 @@
112111
113112
# after quotations should be text only or nothing at all
114113
[te]*$
115-
''', re.VERBOSE)
114+
""", re.VERBOSE)
116115

117116
RE_EMPTY_QUOTATION = re.compile(
118-
r'''
117+
r"""
119118
(
120119
# quotation border: splitter line or a number of quotation marker lines
121120
(?:
@@ -125,7 +124,7 @@
125124
)
126125
)
127126
e*
128-
''', re.VERBOSE)
127+
""", re.VERBOSE)
129128

130129
# ------Original Message------ or ---- Reply Message ----
131130
# With variations in other languages.
@@ -343,9 +342,6 @@ def _replace_link_brackets(msg_body):
343342
344343
Expects msg_body to already be a unicode string; no decoding is done here.
345344
"""
346-
if isinstance(msg_body, bytes):
347-
msg_body = msg_body.decode('utf8')
348-
349345
def link_wrapper(link):
350346
newline_index = msg_body[:link.start()].rfind("\n")
351347
if msg_body[newline_index + 1] == ">":
@@ -385,8 +381,6 @@ def postprocess(msg_body):
385381

386382
def extract_from_plain(msg_body):
387383
"""Extracts a non quoted message from provided plain text."""
388-
stripped_text = msg_body
389-
390384
delimiter = get_delimiter(msg_body)
391385
msg_body = preprocess(msg_body, delimiter)
392386
# don't process too long messages
@@ -418,17 +412,13 @@ def extract_from_html(msg_body):
418412
419413
Returns a unicode string.
420414
"""
421-
msg_body_bytes = msg_body
422-
if isinstance(msg_body, six.text_type):
423-
msg_body_bytes = msg_body.encode('utf8')
424-
425-
if msg_body_bytes.strip() == b'':
415+
if msg_body.strip() == "":
426416
return msg_body
427417

428-
msg_body_bytes = msg_body_bytes.replace(b'\r\n', b'\n')
418+
msg_body = msg_body.replace("\r\n", "\n")
429419
# Cut out xml and doctype tags to avoid conflict with unicode decoding.
430-
msg_body_bytes = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", b"", msg_body_bytes)
431-
html_tree = html_document_fromstring(msg_body_bytes)
420+
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
421+
html_tree = html_document_fromstring(msg_body)
432422
if html_tree is None:
433423
return msg_body
434424

@@ -531,11 +521,11 @@ def extract_from_html_tree(html_tree):
531521
# of replacing data outside the <tag> which might be essential to
532522
# the customer.
533523
remove_namespaces(html_tree_copy)
534-
s = html.tostring(html_tree_copy)
524+
s = html.tostring(html_tree_copy, encoding="ascii")
535525
if not s:
536526
return None
537527

538-
return s.decode('utf-8')
528+
return s.decode("ascii")
539529

540530

541531
def remove_namespaces(root):
@@ -654,23 +644,23 @@ def _readable_text_empty(html_tree):
654644

655645

656646
def is_splitter(line):
657-
'''
647+
"""
658648
Returns a match object if the provided string is a splitter and
659649
None otherwise.
660-
'''
650+
"""
661651
for pattern in SPLITTER_PATTERNS:
662652
matcher = re.match(pattern, line)
663653
if matcher:
664654
return matcher
665655

666656

667657
def text_content(context):
668-
'''XPath Extension function to return a node text content.'''
658+
"""XPath Extension function to return a node text content."""
669659
return context.context_node.xpath("string()").strip()
670660

671661

672662
def tail(context):
673-
'''XPath Extension function to return a node tail text.'''
663+
"""XPath Extension function to return a node tail text."""
674664
return context.context_node.tail or ''
675665

676666

talon/signature/learning/helpers.py

+11-18
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,17 @@
55
* regexp's constants used when evaluating signature's features
66
77
"""
8-
9-
from __future__ import absolute_import
108
import unicodedata
11-
import regex as re
129

13-
from talon.utils import to_unicode
10+
import regex as re
1411

1512
from talon.signature.constants import SIGNATURE_MAX_LINES
1613

17-
1814
rc = re.compile
1915

2016
RE_EMAIL = rc('\S@\S')
2117
RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
22-
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
18+
RE_URL = rc(r"""https?://|www\.[\S]+\.[\S]""")
2319

2420
# Taken from:
2521
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
@@ -55,7 +51,7 @@
5551

5652

5753
def binary_regex_search(prog):
58-
'''Returns a function that returns 1 or 0 depending on regex search result.
54+
"""Returns a function that returns 1 or 0 depending on regex search result.
5955
6056
If regular expression compiled into prog is present in a string
6157
the result of calling the returned function with the string will be 1
@@ -66,12 +62,12 @@ def binary_regex_search(prog):
6662
1
6763
>>> binary_regex_search(re.compile("12"))("34")
6864
0
69-
'''
65+
"""
7066
return lambda s: 1 if prog.search(s) else 0
7167

7268

7369
def binary_regex_match(prog):
74-
'''Returns a function that returns 1 or 0 depending on regex match result.
70+
"""Returns a function that returns 1 or 0 depending on regex match result.
7571
7672
If a string matches regular expression compiled into prog
7773
the result of calling the returned function with the string will be 1
@@ -82,7 +78,7 @@ def binary_regex_match(prog):
8278
1
8379
>>> binary_regex_match(re.compile("12"))("3 12")
8480
0
85-
'''
81+
"""
8682
return lambda s: 1 if prog.match(s) else 0
8783

8884

@@ -135,7 +131,6 @@ def extract_names(sender):
135131
>>> extract_names('')
136132
[]
137133
"""
138-
sender = to_unicode(sender, precise=True)
139134
# Remove non-alphabetical characters
140135
sender = "".join([char if char.isalpha() else ' ' for char in sender])
141136
# Remove too short words and words from "black" list i.e.
@@ -154,7 +149,7 @@ def extract_names(sender):
154149

155150

156151
def categories_percent(s, categories):
157-
'''Returns category characters percent.
152+
"""Returns category characters percent.
158153
159154
>>> categories_percent("qqq ggg hhh", ["Po"])
160155
0.0
@@ -166,29 +161,27 @@ def categories_percent(s, categories):
166161
50.0
167162
>>> categories_percent("s.s,5s", ["Po", "Nd"])
168163
50.0
169-
'''
164+
"""
170165
count = 0
171-
s = to_unicode(s, precise=True)
172166
for c in s:
173167
if unicodedata.category(c) in categories:
174168
count += 1
175169
return 100 * float(count) / len(s) if len(s) else 0
176170

177171

178172
def punctuation_percent(s):
179-
'''Returns punctuation percent.
173+
"""Returns punctuation percent.
180174
181175
>>> punctuation_percent("qqq ggg hhh")
182176
0.0
183177
>>> punctuation_percent("q,w.")
184178
50.0
185-
'''
179+
"""
186180
return categories_percent(s, ['Po'])
187181

188182

189183
def capitalized_words_percent(s):
190-
'''Returns capitalized words percent.'''
191-
s = to_unicode(s, precise=True)
184+
"""Returns capitalized words percent."""
192185
words = re.split('\s', s)
193186
words = [w for w in words if w.strip()]
194187
words = [w for w in words if len(w) > 2]

0 commit comments

Comments
 (0)