This repository was archived by the owner on Sep 1, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcrawl.py
More file actions
46 lines (40 loc) · 1.29 KB
/
crawl.py
File metadata and controls
46 lines (40 loc) · 1.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from __future__ import unicode_literals
import requests, urllib2, json
from bs4 import BeautifulSoup
import html2text
# Shared HTML->text converter: drop hyperlinks and inline images so the
# saved corpus is plain article prose.
h = html2text.HTML2Text()
h.ignore_links = True
h.ignore_images = True
# FIFO work queue of Wikipedia article titles still to be crawled;
# crawl() appends to it, dl() drains it.
q = []
def crawl(u):
if not u == None:
print u
r = requests.get('https://en.wikipedia.org/wiki/'+u).text
parsed_html = BeautifulSoup(r, "lxml")
text = h.handle(r).encode('utf-8')
f = open("data/www/"+ u +".txt","w")
f.write(text)
f.close()
for link in parsed_html.find_all('a'):
try:
n = str(link.get('href')).split('/')[-1].encode('utf-8').strip()
n = urllib2.unquote(n.encode('utf-8').strip()).encode('utf8')
if (not "." in n.decode('utf-8')):
if (not "#" in n.decode('utf-8')):
if (not ":" in n.decode('utf-8')):
if (not "%" in n.decode('utf-8')):
if (not "?" in n.decode('utf-8')):
q.append(str(n))
except UnicodeDecodeError:
return
except UnicodeEncodeError:
return
def dl():
    """Drain the crawl queue: fetch pages FIFO until ``q`` is empty."""
    while q:
        crawl(q.pop(0))
if __name__ == "__main__":
    # Seed the crawl with one article, then drain the queue breadth-first.
    print "Crawling Wikipedia..."
    crawl('Carnegie_Mellon_University') # Seed page
    dl()