forked from Zhiz0id/boilerpipepy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathurl2article.py
executable file
·59 lines (49 loc) · 1.64 KB
/
url2article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from boilerpipe.sax.BoilerpipeHTMLContentHandler import BoilerpipeHTMLContentHandler
import urllib2
from boilerpipe.extractors.ArticleExtractor import ArticleExtractor
from StringIO import StringIO
from lxml import etree, sax
import sys
import codecs
import os
from textwrap import TextWrapper
def main():
    """Download the page at ``sys.argv[1]`` and save its extracted text to a file.

    The page is fetched with ``urllib2``, parsed by lxml's HTML parser and
    replayed as SAX events into a ``BoilerpipeHTMLContentHandler``.  The text
    returned by ``ArticleExtractor.getText`` is wrapped to 80 columns, with two
    line breaks in front of every non-empty line of that text, and written as
    UTF-8 to ``<netloc>-<alphanumeric part of the URL path>.txt`` in the
    current directory.  A one-line summary is printed when done.

    Raises whatever ``urllib2.urlopen`` raises when the URL cannot be fetched.
    """
    url = sys.argv[1]

    # Read everything we need from the response, then release the connection.
    response = urllib2.urlopen(url)
    try:
        charset = response.headers.getparam('charset')
        raw = response.read()
    finally:
        response.close()

    # Servers may omit the charset (getparam returns None) or declare one that
    # is unknown or wrong; fall back to a lenient UTF-8 decode in those cases
    # instead of crashing.
    try:
        html = raw.decode(charset or 'utf-8')
    except (LookupError, UnicodeDecodeError):
        html = raw.decode('utf-8', 'replace')

    # create SAX tree
    tree = etree.parse(StringIO(html), etree.HTMLParser())
    handler = BoilerpipeHTMLContentHandler()
    sax.saxify(tree, handler)

    extractor = ArticleExtractor()
    # parses our data and creates TextDocument with TextBlocks
    doc = handler.toTextDocument()

    # Wrap using plain '\n' everywhere; platform line endings are applied in
    # one place below so that a '\r' is never doubled.
    wrapper = TextWrapper(width=80, initial_indent='\n\n')

    parsed_url = urllib2.urlparse.urlparse(url)
    path_part = "".join(
        [c for c in parsed_url.path if c.isalpha() or c.isdigit() or c == ' ']
    ).rstrip()
    filename = parsed_url.netloc + "-" + path_part + '.txt'

    output = [wrapper.fill(line) for line in extractor.getText(doc).splitlines()]

    # codecs.open() always works in binary mode, so no newline translation is
    # done for us.  str.replace returns a new string, so keep its result.
    if sys.platform.startswith('win'):
        output = [chunk.replace('\n', os.linesep) for chunk in output]

    with codecs.open(filename, 'w', encoding='utf8') as out:
        for chunk in output:
            out.write(chunk)

    print("Article saved. Lines: %s. Filename: %s" % (len(output), filename))
if __name__ == "__main__":
    # The URL to fetch is the only argument; without it just print a hint.
    if len(sys.argv) <= 1:
        print("Please run with url as a parameter")
    else:
        main()