setup.py (4 changes: 2 additions & 2 deletions)
@@ -4,13 +4,12 @@
"""Setup script for speedparser."""

from setuptools import setup, find_packages
import sys, os

try:
from speedparser import VERSION
version = ".".join(map(str, VERSION))
except:
version = '0.2.0'
version = '0.2.1'

# some trove classifiers:

@@ -45,6 +44,7 @@
         # 'feedparser>=0.5',
         'lxml',
         'chardet',
+        'future'
     ],
     entry_points="""
     # -*- Entry points: -*-
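
For reference, the version string that the try branch above builds from the package's VERSION tuple is exactly what the except branch now hard-codes as a fallback; a quick sketch of the join:

    # VERSION as defined in speedparser/__init__.py after this change
    VERSION = (0, 2, 1)
    version = ".".join(map(str, VERSION))
    print(version)  # -> '0.2.1', matching the hard-coded fallback
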
speedparser/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -1,3 +1,3 @@
 from .speedparser import parse
-VERSION = (0,2,0)
+VERSION = (0,2,1)
 __all__ = ['parse', 'VERSION']

speedparser/speedparser.py (44 changes: 27 additions & 17 deletions)
@@ -14,6 +14,8 @@
 
 """
 
+from __future__ import absolute_import, division, print_function
+from builtins import str
 import re
 import time
 try:
@@ -27,7 +29,7 @@
 try:
     import feedparser
 except:
-    import feedparsercompat as feedparser
+    from . import feedparsercompat as feedparser
 
 keymap = feedparser.FeedParserDict.keymap
 fpnamespaces = feedparser._FeedParserMixin.namespaces
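
The `from builtins import str` line added above is supplied by the new `future` dependency in setup.py: on Python 2 it rebinds str to a unicode-aware type whose isinstance checks match Python 3 semantics, and it is a no-op on Python 3. A minimal sketch, assuming the `future` package's documented behavior:

    from builtins import str  # stdlib builtins on Py3; the 'future' shim on Py2

    # With the shim, text/bytes checks behave the same on both versions:
    print(isinstance(u'caf\xe9', str))  # True on Py2 and Py3
    print(isinstance(b'bytes', str))    # False on Py2 and Py3 (bytes are not text)
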
@@ -95,7 +97,7 @@ def strip_outer_tag(text):
"""Strips the outer tag, if text starts with a tag. Not entity aware;
designed to quickly strip outer tags from lxml cleaner output. Only
checks for <p> and <div> outer tags."""
if not text or not isinstance(text, basestring):
if not text or not isinstance(text, str):
return text
stripped = text.strip()
if (stripped.startswith('<p>') or stripped.startswith('<div>')) and \
@@ -104,17 +106,21 @@
     return text
 
 nsre = re.compile(r'xmlns\s*=\s*[\'"](.+?)[\'"]')
+nsreb = re.compile(rb'xmlns\s*=\s*[\'"](.+?)[\'"]')
 
 
 def strip_namespace(document):
-    if document[:1000].count('xmlns') > 5:
-        if 'xmlns' not in document[:1000]:
+    # convert our bytes to a unicode string so we can search and slice.
+    decoded = document.decode('utf8')
+    if decoded[:1000].count('xmlns') > 5:
+        if 'xmlns' not in decoded[:1000]:
             return None, document
-    elif 'xmlns' not in document[:400]:
+    elif 'xmlns' not in decoded[:400]:
         return None, document
-    match = nsre.search(document)
+    match = nsre.search(decoded)
     if match:
-        return match.groups()[0], nsre.sub('', document)
+        print(f"mg type: {type(match.groups()[0])}; doc type: {type(document)}")
+        return match.groups()[0], nsreb.sub(rb'', document)
     return None, document
 
 
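The separate `nsreb` pattern exists because Python 3's re module refuses to mix a str pattern with bytes input: the function above searches the decoded text but must perform the substitution on the raw bytes it returns. A small illustration with a made-up feed snippet:

    import re

    nsre = re.compile(r'xmlns\s*=\s*[\'"](.+?)[\'"]')    # text (str) pattern
    nsreb = re.compile(rb'xmlns\s*=\s*[\'"](.+?)[\'"]')  # bytes pattern

    raw = b'<rdf:RDF xmlns="http://purl.org/rss/1.0/"></rdf:RDF>'  # hypothetical input

    # nsre.sub('', raw) would raise TypeError on Python 3:
    # "cannot use a string pattern on a bytes-like object"
    print(nsre.search(raw.decode('utf8')).group(1))  # 'http://purl.org/rss/1.0/'
    print(nsreb.sub(b'', raw))                       # b'<rdf:RDF ></rdf:RDF>'
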
@@ -145,13 +151,13 @@ def munge_author(author):
 
 def reverse_namespace_map(nsmap):
     d = fpnamespaces.copy()
-    d.update(dict([(v, k) for (k, v) in nsmap.iteritems()]))
+    d.update(dict([(v, k) for (k, v) in nsmap.items()]))
     return d
 
 
 def base_url(root):
     """Determine the base url for a root element."""
-    for attr, value in root.attrib.iteritems():
+    for attr, value in root.attrib.items():
         if attr.endswith('base') and 'http' in value:
             return value
     return None
@@ -165,7 +171,7 @@ def full_href_attribs(attribs, base=None):
     if base is None:
         return dict(attribs)
     d = dict(attribs)
-    for key, value in d.iteritems():
+    for key, value in d.items():
         if key == 'href':
             d[key] = full_href(value, base)
     return d
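
The hunks above all make the same substitution: `dict.iteritems()` was removed in Python 3, while `.items()` exists on both versions, returning a list on Python 2 and a cheap view on Python 3. Roughly:

    d = {'href': '/feed', 'type': 'rss'}

    # d.iteritems()  -> lazy iterator on Py2, AttributeError on Py3
    # d.items()      -> list of pairs on Py2, dict view on Py3
    for key, value in d.items():
        print(key, value)
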
@@ -248,7 +254,7 @@ def __init__(self, root, namespaces={}, version='rss20', encoding='utf-8', feed=
         self.entries = entries
 
     def clean(self, text):
-        if text and isinstance(text, basestring):
+        if text and isinstance(text, str):
             return self.cleaner.clean_html(text)
         return text
 
@@ -504,10 +510,11 @@ def __init__(self, root, namespaces={}, encoding='utf-8', type='rss20', cleaner=
         self.feed = feed
 
     def clean(self, text, outer_tag=True):
-        if text and isinstance(text, basestring):
-            if not outer_tag:
-                txt = self.cleaner.clean_html(text)
-                frag = lxml.html.fragment_fromstring(txt)
+        if text and isinstance(text, str):
+            # txt and frag aren't used, this appears to be no-op code.
+            #if not outer_tag:
+            #    txt = self.cleaner.clean_html(text)
+            #    frag = lxml.html.fragment_fromstring(txt)
             return self.cleaner.clean_html(text)
         return text
 
@@ -589,7 +596,7 @@ def __init__(self, content, cleaner=default_cleaner, unix_timestamp=False, encod
         if self.version in self.version_map:
             self.version = self.version_map[self.version]
         if 'unk' in self.version:
-            raise IncompatibleFeedError("Could not determine version of this feed.")
+            raise IncompatibleFeedError(f"Could not determine version ({self.version}) of this feed.")
         self.namespaces = self.parse_namespaces()
         self.feed = self.parse_feed(self.version, self.encoding)
         self.entries = self.parse_entries(self.version, self.encoding)
@@ -599,14 +606,17 @@ def parse_version(self):
         root_ns, root_tag = clean_ns(r.tag)
         root_tag = root_tag.lower()
         vers = 'unk'
+        print(f"xmlns: {self.xmlns}")
         if self.xmlns and self.xmlns.lower() in xmlns_map:
             value = xmlns_map[self.xmlns.lower()]
             if value == 'rss10' and root_tag == 'rss':
                 value = 'rss010'
             if not (value.startswith('atom') and root_tag == 'rss'):
                 return value
-        elif self.xmlns:
+        elif self.xmlns and len(self.xmlns.split('/')) > 2:
             vers = self.xmlns.split('/')[-2].replace('.', '')
+        elif self.xmlns and len(self.xmlns.split(':')) > 2:
+            vers = self.xmlns.split(':')[-2].replace('.', '')
         tag = root_tag
         if r.attrib.get('version', None):
             vers = r.attrib['version'].replace('.', '')
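
The new len() guards protect the two split branches: `split('/')[-2]` assumes the namespace URI has enough segments, and a short or URN-style xmlns would otherwise raise IndexError or yield garbage. With one real RSS namespace and one hypothetical URN:

    xmlns = 'http://purl.org/rss/1.0/'
    print(xmlns.split('/')[-2].replace('.', ''))  # '10' -> version rss10

    urn = 'urn:example:feed:1.0:doc'  # hypothetical URN-style namespace
    if len(urn.split('/')) > 2:       # no '/' segments, so this guard fails ...
        vers = urn.split('/')[-2]
    elif len(urn.split(':')) > 2:     # ... and the colon branch handles it
        print(urn.split(':')[-2].replace('.', ''))  # '10'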