diff --git a/setup.py b/setup.py
index e9b91d2..2d36d9c 100644
--- a/setup.py
+++ b/setup.py
@@ -4,13 +4,12 @@
"""Setup script for speedparser."""
from setuptools import setup, find_packages
-import sys, os
try:
from speedparser import VERSION
version = ".".join(map(str, VERSION))
except:
- version = '0.2.0'
+ version = '0.2.1'
# some trove classifiers:
@@ -45,6 +44,7 @@
# 'feedparser>=0.5',
'lxml',
'chardet',
+ 'future'
],
entry_points="""
# -*- Entry points: -*-
diff --git a/speedparser/__init__.py b/speedparser/__init__.py
index b41b16f..83f2cc6 100644
--- a/speedparser/__init__.py
+++ b/speedparser/__init__.py
@@ -1,3 +1,3 @@
from .speedparser import parse
-VERSION = (0,2,0)
+VERSION = (0,2,1)
__all__ = ['parse', 'VERSION']
diff --git a/speedparser/speedparser.py b/speedparser/speedparser.py
index 14c0245..73c59dc 100644
--- a/speedparser/speedparser.py
+++ b/speedparser/speedparser.py
@@ -14,6 +14,8 @@
"""
+from __future__ import absolute_import, division, print_function
+from builtins import str
import re
import time
try:
@@ -27,7 +29,7 @@
try:
import feedparser
except:
- import feedparsercompat as feedparser
+ from . import feedparsercompat as feedparser
keymap = feedparser.FeedParserDict.keymap
fpnamespaces = feedparser._FeedParserMixin.namespaces
@@ -95,7 +97,7 @@ def strip_outer_tag(text):
"""Strips the outer tag, if text starts with a tag. Not entity aware;
designed to quickly strip outer tags from lxml cleaner output. Only
checks for <p> and <div>
outer tags."""
- if not text or not isinstance(text, basestring):
+ if not text or not isinstance(text, str):
return text
stripped = text.strip()
if (stripped.startswith('<p>') or stripped.startswith('<div>')) and \
@@ -104,17 +106,21 @@ def strip_outer_tag(text):
return text
nsre = re.compile(r'xmlns\s*=\s*[\'"](.+?)[\'"]')
+nsreb = re.compile(rb'xmlns\s*=\s*[\'"](.+?)[\'"]')
def strip_namespace(document):
- if document[:1000].count('xmlns') > 5:
- if 'xmlns' not in document[:1000]:
+ # convert our bytes to a unicode string so we can search and slice.
+ decoded = document.decode('utf8')
+ if decoded[:1000].count('xmlns') > 5:
+ if 'xmlns' not in decoded[:1000]:
return None, document
- elif 'xmlns' not in document[:400]:
+ elif 'xmlns' not in decoded[:400]:
return None, document
- match = nsre.search(document)
+ match = nsre.search(decoded)
if match:
- return match.groups()[0], nsre.sub('', document)
+ print(f"mg type: {type(match.groups()[0])}; doc type: {type(document)}")
+ return match.groups()[0], nsreb.sub(rb'', document)
return None, document
@@ -145,13 +151,13 @@ def munge_author(author):
def reverse_namespace_map(nsmap):
d = fpnamespaces.copy()
- d.update(dict([(v, k) for (k, v) in nsmap.iteritems()]))
+ d.update(dict([(v, k) for (k, v) in nsmap.items()]))
return d
def base_url(root):
"""Determine the base url for a root element."""
- for attr, value in root.attrib.iteritems():
+ for attr, value in root.attrib.items():
if attr.endswith('base') and 'http' in value:
return value
return None
@@ -165,7 +171,7 @@ def full_href_attribs(attribs, base=None):
if base is None:
return dict(attribs)
d = dict(attribs)
- for key, value in d.iteritems():
+ for key, value in d.items():
if key == 'href':
d[key] = full_href(value, base)
return d
@@ -248,7 +254,7 @@ def __init__(self, root, namespaces={}, version='rss20', encoding='utf-8', feed=
self.entries = entries
def clean(self, text):
- if text and isinstance(text, basestring):
+ if text and isinstance(text, str):
return self.cleaner.clean_html(text)
return text
@@ -504,10 +510,11 @@ def __init__(self, root, namespaces={}, encoding='utf-8', type='rss20', cleaner=
self.feed = feed
def clean(self, text, outer_tag=True):
- if text and isinstance(text, basestring):
- if not outer_tag:
- txt = self.cleaner.clean_html(text)
- frag = lxml.html.fragment_fromstring(txt)
+ if text and isinstance(text, str):
+ # txt and frag aren't used, this appears to be no-op code.
+ #if not outer_tag:
+ # txt = self.cleaner.clean_html(text)
+ # frag = lxml.html.fragment_fromstring(txt)
return self.cleaner.clean_html(text)
return text
@@ -589,7 +596,7 @@ def __init__(self, content, cleaner=default_cleaner, unix_timestamp=False, encod
if self.version in self.version_map:
self.version = self.version_map[self.version]
if 'unk' in self.version:
- raise IncompatibleFeedError("Could not determine version of this feed.")
+ raise IncompatibleFeedError(f"Could not determine version ({self.version}) of this feed.")
self.namespaces = self.parse_namespaces()
self.feed = self.parse_feed(self.version, self.encoding)
self.entries = self.parse_entries(self.version, self.encoding)
@@ -599,14 +606,17 @@ def parse_version(self):
root_ns, root_tag = clean_ns(r.tag)
root_tag = root_tag.lower()
vers = 'unk'
+ print(f"xmlns: {self.xmlns}")
if self.xmlns and self.xmlns.lower() in xmlns_map:
value = xmlns_map[self.xmlns.lower()]
if value == 'rss10' and root_tag == 'rss':
value = 'rss010'
if not (value.startswith('atom') and root_tag == 'rss'):
return value
- elif self.xmlns:
+ elif self.xmlns and len(self.xmlns.split('/')) > 2:
vers = self.xmlns.split('/')[-2].replace('.', '')
+ elif self.xmlns and len(self.xmlns.split(':')) > 2:
+ vers = self.xmlns.split(':')[-2].replace('.', '')
tag = root_tag
if r.attrib.get('version', None):
vers = r.attrib['version'].replace('.', '')