|
6 | 6 | """
|
7 | 7 |
|
8 | 8 | from __future__ import absolute_import
|
9 |
| -import regex as re |
| 9 | + |
10 | 10 | import logging
|
11 | 11 | from copy import deepcopy
|
12 | 12 |
|
13 |
| -from lxml import html, etree |
14 |
| - |
15 |
| -from talon.utils import (get_delimiter, html_tree_to_text, |
16 |
| - html_document_fromstring) |
17 |
| -from talon import html_quotations |
| 13 | +import regex as re |
| 14 | +from lxml import etree, html |
18 | 15 | from six.moves import range
|
19 |
| -import six |
20 | 16 |
|
| 17 | +from talon import html_quotations |
| 18 | +from talon.utils import (get_delimiter, html_document_fromstring, |
| 19 | + html_tree_to_text) |
21 | 20 |
|
22 | 21 | log = logging.getLogger(__name__)
|
23 | 22 |
|
|
94 | 93 | )
|
95 | 94 |
|
96 | 95 | RE_QUOTATION = re.compile(
|
97 |
| - r''' |
| 96 | + r""" |
98 | 97 | (
|
99 | 98 | # quotation border: splitter line or a number of quotation marker lines
|
100 | 99 | (?:
|
|
112 | 111 |
|
113 | 112 | # after quotations should be text only or nothing at all
|
114 | 113 | [te]*$
|
115 |
| - ''', re.VERBOSE) |
| 114 | + """, re.VERBOSE) |
116 | 115 |
|
117 | 116 | RE_EMPTY_QUOTATION = re.compile(
|
118 |
| - r''' |
| 117 | + r""" |
119 | 118 | (
|
120 | 119 | # quotation border: splitter line or a number of quotation marker lines
|
121 | 120 | (?:
|
|
125 | 124 | )
|
126 | 125 | )
|
127 | 126 | e*
|
128 |
| - ''', re.VERBOSE) |
| 127 | + """, re.VERBOSE) |
129 | 128 |
|
130 | 129 | # ------Original Message------ or ---- Reply Message ----
|
131 | 130 | # With variations in other languages.
|
@@ -343,9 +342,6 @@ def _replace_link_brackets(msg_body):
|
343 | 342 |
|
344 | 343 | Converts msg_body into a unicode
|
345 | 344 | """
|
346 |
| - if isinstance(msg_body, bytes): |
347 |
| - msg_body = msg_body.decode('utf8') |
348 |
| - |
349 | 345 | def link_wrapper(link):
|
350 | 346 | newline_index = msg_body[:link.start()].rfind("\n")
|
351 | 347 | if msg_body[newline_index + 1] == ">":
|
@@ -385,8 +381,6 @@ def postprocess(msg_body):
|
385 | 381 |
|
386 | 382 | def extract_from_plain(msg_body):
|
387 | 383 | """Extracts a non quoted message from provided plain text."""
|
388 |
| - stripped_text = msg_body |
389 |
| - |
390 | 384 | delimiter = get_delimiter(msg_body)
|
391 | 385 | msg_body = preprocess(msg_body, delimiter)
|
392 | 386 | # don't process too long messages
|
@@ -418,17 +412,13 @@ def extract_from_html(msg_body):
|
418 | 412 |
|
419 | 413 | Returns a unicode string.
|
420 | 414 | """
|
421 |
| - msg_body_bytes = msg_body |
422 |
| - if isinstance(msg_body, six.text_type): |
423 |
| - msg_body_bytes = msg_body.encode('utf8') |
424 |
| - |
425 |
| - if msg_body_bytes.strip() == b'': |
| 415 | + if msg_body.strip() == "": |
426 | 416 | return msg_body
|
427 | 417 |
|
428 |
| - msg_body_bytes = msg_body_bytes.replace(b'\r\n', b'\n') |
| 418 | + msg_body = msg_body.replace("\r\n", "\n") |
429 | 419 | # Cut out xml and doctype tags to avoid conflict with unicode decoding.
|
430 |
| - msg_body_bytes = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", b"", msg_body_bytes) |
431 |
| - html_tree = html_document_fromstring(msg_body_bytes) |
| 420 | + msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) |
| 421 | + html_tree = html_document_fromstring(msg_body) |
432 | 422 | if html_tree is None:
|
433 | 423 | return msg_body
|
434 | 424 |
|
@@ -531,11 +521,11 @@ def extract_from_html_tree(html_tree):
|
531 | 521 | # of replacing data outside the <tag> which might be essential to
|
532 | 522 | # the customer.
|
533 | 523 | remove_namespaces(html_tree_copy)
|
534 |
| - s = html.tostring(html_tree_copy) |
| 524 | + s = html.tostring(html_tree_copy, encoding="ascii") |
535 | 525 | if not s:
|
536 | 526 | return None
|
537 | 527 |
|
538 |
| - return s.decode('utf-8') |
| 528 | + return s.decode("ascii") |
539 | 529 |
|
540 | 530 |
|
541 | 531 | def remove_namespaces(root):
|
@@ -654,23 +644,23 @@ def _readable_text_empty(html_tree):
|
654 | 644 |
|
655 | 645 |
|
656 | 646 | def is_splitter(line):
|
657 |
| - ''' |
| 647 | + """ |
658 | 648 | Returns Matcher object if provided string is a splitter and
|
659 | 649 | None otherwise.
|
660 |
| - ''' |
| 650 | + """ |
661 | 651 | for pattern in SPLITTER_PATTERNS:
|
662 | 652 | matcher = re.match(pattern, line)
|
663 | 653 | if matcher:
|
664 | 654 | return matcher
|
665 | 655 |
|
666 | 656 |
|
667 | 657 | def text_content(context):
|
668 |
| - '''XPath Extension function to return a node text content.''' |
| 658 | + """XPath Extension function to return a node text content.""" |
669 | 659 | return context.context_node.xpath("string()").strip()
|
670 | 660 |
|
671 | 661 |
|
672 | 662 | def tail(context):
|
673 |
| - '''XPath Extension function to return a node tail text.''' |
| 663 | + """XPath Extension function to return a node tail text.""" |
674 | 664 | return context.context_node.tail or ''
|
675 | 665 |
|
676 | 666 |
|
|
0 commit comments