import json
import multiprocessing
import multiprocessing.pool
import time
from os import path

import html2text
import requests
from tqdm import tqdm

SCORE_THRESH = 1           # minimum story score to keep a story
COMMENT_SCORE_THRESH = -1  # minimum comment score (currently unused)
DESC_THRESH = 1            # minimum number of descendants (comments) to keep a story
PERCENT_TAKE = 0.5         # top fraction of comments eligible in the alternative
                           # (commented-out) selection strategy below
HN_URL = "https://hacker-news.firebaseio.com/v0/item/"
RANDOM_SEED = 42           # seed for the alternative selection strategy

# Disable line wrapping and collapse single line breaks in converted comment
# text. Note: depending on the installed html2text version these module-level
# assignments may have no effect; newer releases read defaults from
# html2text.config instead.
html2text.BODY_WIDTH = 0
html2text.single_line_break = True
class NoDaemonProcess(multiprocessing.Process):
    # Make the 'daemon' attribute always return False so these workers
    # are allowed to spawn child processes of their own.
    def _get_daemon(self):
        return False

    def _set_daemon(self, value):
        pass

    daemon = property(_get_daemon, _set_daemon)


# We subclass multiprocessing.pool.Pool instead of multiprocessing.Pool
# because the latter is only a wrapper function, not a proper class.
class MyPool(multiprocessing.pool.Pool):
    Process = NoDaemonProcess
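
# Background: daemonic processes cannot have children, and the default
# multiprocessing.Pool makes its workers daemonic. The non-daemonic pool above
# lets get_check_story (run in the outer pool) open its own inner pool of
# comment workers. A minimal sketch of that nesting pattern, with hypothetical
# names for illustration only:
#
#   def inner(x):
#       return x * x
#
#   def outer(n):
#       with MyPool(2) as p:      # inner pool created inside a pool worker
#           return sum(p.map(inner, range(n)))
#
#   if __name__ == "__main__":
#       with MyPool(4) as p:      # outer pool
#           print(p.map(outer, [3, 4, 5]))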
# Story Functions
def get_check_story(id):
    # Skip stories that already have a marker file from a previous run.
    if path.exists(''.join(["donev2/", str(id)])):
        return "already did"
    # Mark this id as done before fetching, so reruns skip it.
    with open(''.join(['donev2/', str(id)]), 'w') as donefile:
        donefile.write('w')
    # Fetch the item from the HN API, retrying once after a short pause.
    try:
        item = json.loads(requests.get(''.join([HN_URL, str(id), '.json'])).text)
    except Exception:
        time.sleep(2)
        try:
            item = json.loads(requests.get(''.join([HN_URL, str(id), '.json'])).text)
        except requests.exceptions.SSLError:
            return "dead"
    # The API returns JSON null for nonexistent items, so item may be None here.
    try:
        if item['dead'] == True:
            return "dead"
    except TypeError:
        return "nonexistent"
    except KeyError:
        pass
    # Keep only stories above the score and comment-count thresholds.
    try:
        if item['type'] != 'story':
            return "not counted (not story)"
        if item['score'] < SCORE_THRESH:
            return "not counted (score too low)"
        if item['descendants'] < DESC_THRESH:
            return "not counted (num descendants too low)"
    except TypeError:
        return "not counted (typeerror)"
    except KeyError:
        return "not counted (keyerror)"
    return parse_story(id, item)
def parse_story(id, item):
    # Build the story header: title and author, then URL and self-text if present.
    header = ''.join([item['title'], ' - ', item['by'], '\n'])
    try:
        header = ''.join(['\n', header, item['url'], '\n'])
    except KeyError:
        pass
    try:
        header = ''.join(['\n', header, item['text'], '\n'])
    except KeyError:
        pass
    header = ''.join([header, '======'])
    # Parse the top-level comment chains in parallel; this inner pool is why
    # the outer pool must be non-daemonic (see MyPool above).
    commentpool = MyPool(12)
    comments = commentpool.map(top_comment_parse, item.get('kids', []))
    commentpool.close()
    commentpool.join()
    # Drop empty chains, then join the rest with a separator line.
    comments = list(filter(None, comments))
    comments = '------\n'.join(comments)
    header = ''.join([header, '\n', comments])
    with open(''.join(['datav2/', str(id)]), 'w') as outfile:
        outfile.write(header)
    return header
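
# For reference, the on-disk document written above has roughly this shape
# (separators taken directly from the code: '======' after the header,
# '------' between top-level comment chains, '~~~' between a comment and its
# chosen reply):
#
#   <title> - <author>
#   <url or self-text, when present>
#   ======
#   <commenter>
#   <comment text>
#   ~~~
#   <reply author>
#   <reply text>
#   ------
#   <next top-level commenter>
#   ...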
# Comment Functions
def check_comment(item):
    # A comment is usable only if it has text and is not marked dead.
    try:
        if item['text'] is None:
            return False
    except KeyError:
        return False
    try:
        if item['dead'] == True:
            return False
    except KeyError:
        return True
    return True
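
# Quick sanity examples for check_comment (inputs are hand-made dicts, not
# real API responses):
#   check_comment({'text': 'hi', 'by': 'a'})     -> True
#   check_comment({'text': 'hi', 'dead': True})  -> False
#   check_comment({'by': 'a'})                   -> False  (no text)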
def choose_next_sub_comment(comment_list):
    # Follow the comment with the most direct replies.
    def get_max_len(item):
        try:
            return len(item['kids'])
        except KeyError:
            return 0

    comment_list.sort(key=get_max_len, reverse=True)
    return comment_list[0]
    # Alternative (unreachable) strategy: sort by score and pick a seeded-random
    # comment from the top PERCENT_TAKE fraction. As originally written this was
    # also buggy (random.choice takes no seed argument); a working version,
    # requiring `import math` and `import random`, would be:
    #   rng = random.Random(RANDOM_SEED)
    #   comment_list.sort(key=lambda x: x['score'], reverse=True)
    #   good_inds = math.ceil(PERCENT_TAKE * len(comment_list))
    #   return rng.choice(comment_list[:good_inds])
def sub_comment_parse(item, comment_block):
    # Convert the comment HTML to plain text and append "author\ntext".
    # Alternative extractors that were tried:
    #   item['text'] = BeautifulSoup(item['text'], "lxml").get_text(separator="\n")
    #   item['text'] = lxml.html.fromstring(item['text']).text_content()
    item['text'] = html2text.html2text(item['text'])
    comment_block = ''.join([comment_block, item['by'], '\n', item['text']])
    # Leaf comment: nothing further to follow.
    try:
        if len(item['kids']) == 0:
            return comment_block
    except KeyError:
        return comment_block
    # Fetch all direct replies, retrying once on failure, and mark them done.
    kids = []
    for kid_id in item['kids']:
        try:
            kid = json.loads(requests.get(''.join([HN_URL, str(kid_id), '.json'])).text)
        except Exception:
            time.sleep(2)
            try:
                kid = json.loads(requests.get(''.join([HN_URL, str(kid_id), '.json'])).text)
            except requests.exceptions.SSLError:
                continue
        kids.append(kid)
        with open(''.join(['donev2/', str(kid['id'])]), 'w') as donefile:
            donefile.write('w')
    # All replies might be dead or deleted.
    possible_routes = [kid for kid in kids if check_comment(kid)]
    if len(possible_routes) == 0:
        return comment_block
    # Recurse down a single chosen reply, so each chain is one linear thread.
    next_comment = choose_next_sub_comment(possible_routes)
    comment_block = ''.join([comment_block, '~~~\n'])
    comment_block = sub_comment_parse(next_comment, comment_block)
    return comment_block
def top_comment_parse(id):
    # Fetch the top-level comment, retrying once after a short pause.
    try:
        item = json.loads(requests.get(''.join([HN_URL, str(id), '.json'])).text)
    except Exception:
        time.sleep(2)
        try:
            item = json.loads(requests.get(''.join([HN_URL, str(id), '.json'])).text)
        except requests.exceptions.SSLError:
            # Return an empty chain so parse_story's filter(None, ...) drops it.
            return ''
    with open(''.join(['donev2/', str(id)]), 'w') as donefile:
        donefile.write('w')
    # Walk one linear chain of replies starting from this comment.
    comment_block = ''
    if check_comment(item):
        comment_block = sub_comment_parse(item, comment_block)
    return comment_block
def main(end_id, num_threads):
    # end_id is currently unused; the ids to process come from the story_list
    # file, which holds a bracketed, comma-separated list of story ids.
    pool = MyPool(num_threads)
    with open('story_list', 'r') as id_file:
        indlist = id_file.readline().strip("]").strip("[").split(', ')
    # Map each story id to a result string, with a progress bar.
    resultlist = list(tqdm(pool.imap(get_check_story, indlist), total=len(indlist)))
    pool.close()
    with open('resultlist', 'w') as result:
        result.write(str(resultlist))
if __name__ == "__main__":
    # Smaller runs used during development:
    #   get_check_story(200)
    #   main(2000, 8)
    main(245317120, 8)
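
# Usage sketch. Assumptions: a `story_list` file in the working directory
# containing a list literal of HN item ids, plus existing `donev2/` and
# `datav2/` directories for marker and output files (the script does not
# create them). The ids below are only illustrative:
#
#   $ mkdir -p donev2 datav2
#   $ echo "[8863, 121003]" > story_list
#   $ python hn_scrape.py
#
# Per-story documents land in datav2/<id>; the per-story outcome strings are
# written to the `resultlist` file.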