import json
import multiprocessing
import multiprocessing.pool
import time
from os import path

import html2text
import requests
from tqdm import tqdm

SCORE_THRESH = 1           # minimum story score to keep a story
COMMENT_SCORE_THRESH = -1  # minimum comment score (currently unused)
DESC_THRESH = 1            # minimum number of descendants (comments) to keep a story
PERCENT_TAKE = 0.5         # top fraction of comments eligible in the alternative
                           # (commented-out) selection strategy below
HN_URL = "https://hacker-news.firebaseio.com/v0/item/"
RANDOM_SEED = 42           # seed for the alternative selection strategy

# Disable line wrapping and collapse single line breaks in converted comment
# text. Note: depending on the installed html2text version these module-level
# assignments may have no effect; newer releases read defaults from
# html2text.config instead.
html2text.BODY_WIDTH = 0
html2text.single_line_break = True
class NoDaemonProcess(multiprocessing.Process):
    # Make the 'daemon' attribute always return False so these workers
    # are allowed to spawn child processes of their own.
    def _get_daemon(self):
        return False

    def _set_daemon(self, value):
        pass

    daemon = property(_get_daemon, _set_daemon)


# We subclass multiprocessing.pool.Pool instead of multiprocessing.Pool
# because the latter is only a wrapper function, not a proper class.
class MyPool(multiprocessing.pool.Pool):
    Process = NoDaemonProcess
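
# Background: daemonic processes cannot have children, and the default
# multiprocessing.Pool makes its workers daemonic. The non-daemonic pool above
# lets get_check_story (run in the outer pool) open its own inner pool of
# comment workers. A minimal sketch of that nesting pattern, with hypothetical
# names for illustration only:
#
#   def inner(x):
#       return x * x
#
#   def outer(n):
#       with MyPool(2) as p:      # inner pool created inside a pool worker
#           return sum(p.map(inner, range(n)))
#
#   if __name__ == "__main__":
#       with MyPool(4) as p:      # outer pool
#           print(p.map(outer, [3, 4, 5]))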
# Story Functions
def get_check_story(id):
    # Skip stories that already have a marker file from a previous run.
    if path.exists(''.join(["donev2/", str(id)])):
        return "already did"
    # Mark this id as done before fetching, so reruns skip it.
    with open(''.join(['donev2/', str(id)]), 'w') as donefile:
        donefile.write('w')
    # Fetch the item from the HN API, retrying once after a short pause.
    try:
        item = json.loads(requests.get(''.join([HN_URL, str(id), '.json'])).text)
    except Exception:
        time.sleep(2)
        try:
            item = json.loads(requests.get(''.join([HN_URL, str(id), '.json'])).text)
        except requests.exceptions.SSLError:
            return "dead"
    # The API returns JSON null for nonexistent items, so item may be None here.
    try:
        if item['dead'] == True:
            return "dead"
    except TypeError:
        return "nonexistent"
    except KeyError:
        pass
    # Keep only stories above the score and comment-count thresholds.
    try:
        if item['type'] != 'story':
            return "not counted (not story)"
        if item['score'] < SCORE_THRESH:
            return "not counted (score too low)"
        if item['descendants'] < DESC_THRESH:
            return "not counted (num descendants too low)"
    except TypeError:
        return "not counted (typeerror)"
    except KeyError:
        return "not counted (keyerror)"
    return parse_story(id, item)
def parse_story(id, item):
    # Build the story header: title and author, then URL and self-text if present.
    header = ''.join([item['title'], ' - ', item['by'], '\n'])
    try:
        header = ''.join(['\n', header, item['url'], '\n'])
    except KeyError:
        pass
    try:
        header = ''.join(['\n', header, item['text'], '\n'])
    except KeyError:
        pass
    header = ''.join([header, '======'])
    # Parse the top-level comment chains in parallel; this inner pool is why
    # the outer pool must be non-daemonic (see MyPool above).
    commentpool = MyPool(12)
    comments = commentpool.map(top_comment_parse, item.get('kids', []))
    commentpool.close()
    commentpool.join()
    # Drop empty chains, then join the rest with a separator line.
    comments = list(filter(None, comments))
    comments = '------\n'.join(comments)
    header = ''.join([header, '\n', comments])
    with open(''.join(['datav2/', str(id)]), 'w') as outfile:
        outfile.write(header)
    return header
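
# For reference, the on-disk document written above has roughly this shape
# (separators taken directly from the code: '======' after the header,
# '------' between top-level comment chains, '~~~' between a comment and its
# chosen reply):
#
#   <title> - <author>
#   <url or self-text, when present>
#   ======
#   <commenter>
#   <comment text>
#   ~~~
#   <reply author>
#   <reply text>
#   ------
#   <next top-level commenter>
#   ...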
# Comment Functions
def check_comment(item):
    # A comment is usable only if it has text and is not marked dead.
    try:
        if item['text'] is None:
            return False
    except KeyError:
        return False
    try:
        if item['dead'] == True:
            return False
    except KeyError:
        return True
    return True
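
# Quick sanity examples for check_comment (inputs are hand-made dicts, not
# real API responses):
#   check_comment({'text': 'hi', 'by': 'a'})     -> True
#   check_comment({'text': 'hi', 'dead': True})  -> False
#   check_comment({'by': 'a'})                   -> False  (no text)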
def choose_next_sub_comment(comment_list):
    # Follow the comment with the most direct replies.
    def get_max_len(item):
        try:
            return len(item['kids'])
        except KeyError:
            return 0

    comment_list.sort(key=get_max_len, reverse=True)
    return comment_list[0]
    # Alternative (unreachable) strategy: sort by score and pick a seeded-random
    # comment from the top PERCENT_TAKE fraction. As originally written this was
    # also buggy (random.choice takes no seed argument); a working version,
    # requiring `import math` and `import random`, would be:
    #   rng = random.Random(RANDOM_SEED)
    #   comment_list.sort(key=lambda x: x['score'], reverse=True)
    #   good_inds = math.ceil(PERCENT_TAKE * len(comment_list))
    #   return rng.choice(comment_list[:good_inds])
def sub_comment_parse(item, comment_block):
    # Convert the comment HTML to plain text and append "author\ntext".
    # Alternative extractors that were tried:
    #   item['text'] = BeautifulSoup(item['text'], "lxml").get_text(separator="\n")
    #   item['text'] = lxml.html.fromstring(item['text']).text_content()
    item['text'] = html2text.html2text(item['text'])
    comment_block = ''.join([comment_block, item['by'], '\n', item['text']])
    # Leaf comment: nothing further to follow.
    try:
        if len(item['kids']) == 0:
            return comment_block
    except KeyError:
        return comment_block
    # Fetch all direct replies, retrying once on failure, and mark them done.
    kids = []
    for kid_id in item['kids']:
        try:
            kid = json.loads(requests.get(''.join([HN_URL, str(kid_id), '.json'])).text)
        except Exception:
            time.sleep(2)
            try:
                kid = json.loads(requests.get(''.join([HN_URL, str(kid_id), '.json'])).text)
            except requests.exceptions.SSLError:
                continue
        kids.append(kid)
        with open(''.join(['donev2/', str(kid['id'])]), 'w') as donefile:
            donefile.write('w')
    # All replies might be dead or deleted.
    possible_routes = [kid for kid in kids if check_comment(kid)]
    if len(possible_routes) == 0:
        return comment_block
    # Recurse down a single chosen reply, so each chain is one linear thread.
    next_comment = choose_next_sub_comment(possible_routes)
    comment_block = ''.join([comment_block, '~~~\n'])
    comment_block = sub_comment_parse(next_comment, comment_block)
    return comment_block
def top_comment_parse(id):
    # Fetch the top-level comment, retrying once after a short pause.
    try:
        item = json.loads(requests.get(''.join([HN_URL, str(id), '.json'])).text)
    except Exception:
        time.sleep(2)
        try:
            item = json.loads(requests.get(''.join([HN_URL, str(id), '.json'])).text)
        except requests.exceptions.SSLError:
            # Return an empty chain so parse_story's filter(None, ...) drops it.
            return ''
    with open(''.join(['donev2/', str(id)]), 'w') as donefile:
        donefile.write('w')
    # Walk one linear chain of replies starting from this comment.
    comment_block = ''
    if check_comment(item):
        comment_block = sub_comment_parse(item, comment_block)
    return comment_block
def main(end_id, num_threads):
    # end_id is currently unused; the ids to process come from the story_list
    # file, which holds a bracketed, comma-separated list of story ids.
    pool = MyPool(num_threads)
    with open('story_list', 'r') as id_file:
        indlist = id_file.readline().strip("]").strip("[").split(', ')
    # Map each story id to a result string, with a progress bar.
    resultlist = list(tqdm(pool.imap(get_check_story, indlist), total=len(indlist)))
    pool.close()
    with open('resultlist', 'w') as result:
        result.write(str(resultlist))
if __name__ == "__main__":
    # Smaller runs used during development:
    #   get_check_story(200)
    #   main(2000, 8)
    main(245317120, 8)
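
# Usage sketch. Assumptions: a `story_list` file in the working directory
# containing a list literal of HN item ids, plus existing `donev2/` and
# `datav2/` directories for marker and output files (the script does not
# create them). The ids below are only illustrative:
#
#   $ mkdir -p donev2 datav2
#   $ echo "[8863, 121003]" > story_list
#   $ python hn_scrape.py
#
# Per-story documents land in datav2/<id>; the per-story outcome strings are
# written to the `resultlist` file.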