Skip to content

Commit

Permalink
Delete main page trees after links extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
schedutron committed Feb 22, 2019
1 parent 383da15 commit ec876f4
Showing 1 changed file with 8 additions and 12 deletions.
20 changes: 8 additions & 12 deletions chirps/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ def scrape_themerkle(num_pages=17):
tree = fromstring(r.content)
collection = tree.xpath("//h2[@class='title front-view-title']/a/@href")
links.extend(collection)
del tree

for link in links:
r = requests.get(link)
Expand All @@ -180,6 +181,8 @@ def scrape_udacity():
url = 'https://blog.udacity.com/%s/%s' % (now.year, now.month)
r = requests.get(url, headers=HEADERS)
tree = fromstring(r.content)
# Extract the article links *before* discarding the parsed page: deleting
# `tree` first would make the xpath lookup raise UnboundLocalError.
links = tree.xpath('//div[@class="entry-content"]/p[last()]/a/@href')
# The main-page tree is no longer needed once the links are extracted;
# drop the reference so it can be freed before the per-article requests.
del tree

for link in links:
r = requests.get(link, headers=HEADERS)
Expand All @@ -198,6 +201,7 @@ def scrape_coursera():
r = requests.get(url, headers=HEADERS)
tree = fromstring(r.content)
links = tree.xpath('//div[@class="recent"]//div[@class="title"]/a/@href')
del tree

for link in links:
r = requests.get(link, headers=HEADERS)
Expand Down Expand Up @@ -239,23 +243,15 @@ def scrape_thenewstack():

tree = fromstring(r.content)
links = tree.xpath('//div[@class="normalstory-box"]/header/h2/a/@href')
del tree

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
for link in links:
r = requests.get(link, verify=False)
tree = fromstring(r.content)
paras = tree.xpath('//div[@class="post-content"]/p')
paras = [para.text_content() for para in paras if para.text_content()]
para = random.choice(paras)
para = tokenizer.tokenize(para)
# To fix unicode issues:
para = [unicodedata.normalize('NFKD', text) for text in para]
for i in range(10):
text = random.choice(para)
if text and 60 < len(text) < 210:
break
else:
continue
para = extract_paratext(paras)
text = extract_text(para)

yield '"%s" %s' % (text, link)


Expand Down

0 comments on commit ec876f4

Please sign in to comment.