diff --git a/setup.py b/setup.py index e9b91d2..2d36d9c 100644 --- a/setup.py +++ b/setup.py @@ -4,13 +4,12 @@ """Setup script for speedparser.""" from setuptools import setup, find_packages -import sys, os try: from speedparser import VERSION version = ".".join(map(str, VERSION)) except: - version = '0.2.0' + version = '0.2.1' # some trove classifiers: @@ -45,6 +44,7 @@ # 'feedparser>=0.5', 'lxml', 'chardet', + 'future' ], entry_points=""" # -*- Entry points: -*- diff --git a/speedparser/__init__.py b/speedparser/__init__.py index b41b16f..83f2cc6 100644 --- a/speedparser/__init__.py +++ b/speedparser/__init__.py @@ -1,3 +1,3 @@ from .speedparser import parse -VERSION = (0,2,0) +VERSION = (0,2,1) __all__ = ['parse', 'VERSION'] diff --git a/speedparser/speedparser.py b/speedparser/speedparser.py index 14c0245..73c59dc 100644 --- a/speedparser/speedparser.py +++ b/speedparser/speedparser.py @@ -14,6 +14,8 @@ """ +from __future__ import absolute_import, division, print_function +from builtins import str import re import time try: @@ -27,7 +29,7 @@ try: import feedparser except: - import feedparsercompat as feedparser + from . import feedparsercompat as feedparser keymap = feedparser.FeedParserDict.keymap fpnamespaces = feedparser._FeedParserMixin.namespaces @@ -95,7 +97,7 @@ def strip_outer_tag(text): """Strips the outer tag, if text starts with a tag. Not entity aware; designed to quickly strip outer tags from lxml cleaner output. Only checks for
<div> and <p> outer tags.""" - if not text or not isinstance(text, basestring): + if not text or not isinstance(text, str): return text stripped = text.strip() if (stripped.startswith('<div>') or stripped.startswith('<p>
')) and \ @@ -104,17 +106,21 @@ def strip_outer_tag(text): return text nsre = re.compile(r'xmlns\s*=\s*[\'"](.+?)[\'"]') +nsreb = re.compile(rb'xmlns\s*=\s*[\'"](.+?)[\'"]') def strip_namespace(document): - if document[:1000].count('xmlns') > 5: - if 'xmlns' not in document[:1000]: + # convert our bytes to a unicode string so we can search and slice. + decoded = document.decode('utf8') + if decoded[:1000].count('xmlns') > 5: + if 'xmlns' not in decoded[:1000]: return None, document - elif 'xmlns' not in document[:400]: + elif 'xmlns' not in decoded[:400]: return None, document - match = nsre.search(document) + match = nsre.search(decoded) if match: - return match.groups()[0], nsre.sub('', document) + print(f"mg type: {type(match.groups()[0])}; doc type: {type(document)}") + return match.groups()[0], nsreb.sub(rb'', document) return None, document @@ -145,13 +151,13 @@ def munge_author(author): def reverse_namespace_map(nsmap): d = fpnamespaces.copy() - d.update(dict([(v, k) for (k, v) in nsmap.iteritems()])) + d.update(dict([(v, k) for (k, v) in nsmap.items()])) return d def base_url(root): """Determine the base url for a root element.""" - for attr, value in root.attrib.iteritems(): + for attr, value in root.attrib.items(): if attr.endswith('base') and 'http' in value: return value return None @@ -165,7 +171,7 @@ def full_href_attribs(attribs, base=None): if base is None: return dict(attribs) d = dict(attribs) - for key, value in d.iteritems(): + for key, value in d.items(): if key == 'href': d[key] = full_href(value, base) return d @@ -248,7 +254,7 @@ def __init__(self, root, namespaces={}, version='rss20', encoding='utf-8', feed= self.entries = entries def clean(self, text): - if text and isinstance(text, basestring): + if text and isinstance(text, str): return self.cleaner.clean_html(text) return text @@ -504,10 +510,11 @@ def __init__(self, root, namespaces={}, encoding='utf-8', type='rss20', cleaner= self.feed = feed def clean(self, text, outer_tag=True): - 
if text and isinstance(text, basestring): - if not outer_tag: - txt = self.cleaner.clean_html(text) - frag = lxml.html.fragment_fromstring(txt) + if text and isinstance(text, str): + # txt and frag aren't used, this appears to be no-op code. + #if not outer_tag: + # txt = self.cleaner.clean_html(text) + # frag = lxml.html.fragment_fromstring(txt) return self.cleaner.clean_html(text) return text @@ -589,7 +596,7 @@ def __init__(self, content, cleaner=default_cleaner, unix_timestamp=False, encod if self.version in self.version_map: self.version = self.version_map[self.version] if 'unk' in self.version: - raise IncompatibleFeedError("Could not determine version of this feed.") + raise IncompatibleFeedError(f"Could not determine version ({self.version}) of this feed.") self.namespaces = self.parse_namespaces() self.feed = self.parse_feed(self.version, self.encoding) self.entries = self.parse_entries(self.version, self.encoding) @@ -599,14 +606,17 @@ def parse_version(self): root_ns, root_tag = clean_ns(r.tag) root_tag = root_tag.lower() vers = 'unk' + print(f"xmlns: {self.xmlns}") if self.xmlns and self.xmlns.lower() in xmlns_map: value = xmlns_map[self.xmlns.lower()] if value == 'rss10' and root_tag == 'rss': value = 'rss010' if not (value.startswith('atom') and root_tag == 'rss'): return value - elif self.xmlns: + elif self.xmlns and len(self.xmlns.split('/')) > 2: vers = self.xmlns.split('/')[-2].replace('.', '') + elif self.xmlns and len(self.xmlns.split(':')) > 2: + vers = self.xmlns.split(':')[-2].replace('.', '') tag = root_tag if r.attrib.get('version', None): vers = r.attrib['version'].replace('.', '')