# stackoverflow_api_recommender/parser.py (Riya-11/stackoverflow_api_recommender)

from bs4 import BeautifulSoup

import log
import models
import utils


def parse_answer(answer_soup: BeautifulSoup):
    """Build a ``models.Answer`` from the markup of a single answer.

    Looks inside ``answer_soup`` for the ``div`` with class ``post-text``
    (the answer body) and the ``div`` with ``itemprop="upvoteCount"`` (whose
    ``data-value`` attribute holds the score).

    Args:
        answer_soup: Parsed markup of one answer container.

    Returns:
        A ``models.Answer`` whose ``text`` is the body text with every
        ``<code>`` element stripped out, ``code_snippets`` is the text of those
        ``<code>`` elements, ``out_links`` is the ``href`` of every anchor in
        the body and ``upvotes`` is the parsed score. ``url`` is always ``""``
        and ``lang`` is always ``"java"``.

    Note:
        ``answer_soup`` is modified in place: its ``<code>`` elements are
        replaced with empty strings. A missing/unparsable score or a missing
        body is logged and replaced by a neutral default (``0`` / empty
        content) rather than raising, so one malformed answer does not abort
        parsing of the whole page.
    """
    upvote_node = answer_soup.find('div', attrs={'itemprop': 'upvoteCount'})
    raw_upvotes = upvote_node.get('data-value') if upvote_node is not None else None
    try:
        upvotes = int(raw_upvotes)
    except (TypeError, ValueError):
        # TypeError: node or attribute absent (None); ValueError: not a number.
        upvotes = 0
        log.log(" Failed to find upvote count", module="parser")

    outlinks = []
    code_snippets_texts = []
    answer_text_content = ""

    answer_content_soup = answer_soup.find('div', attrs={'class': 'post-text'})
    if answer_content_soup is None:
        log.log(" Failed to find answer content", module="parser")
    else:
        # Links are collected before the code elements are stripped.
        outlinks = [a['href'] for a in answer_content_soup.find_all('a', href=True)]

        code_snippets_soups = answer_content_soup.find_all('code')
        code_snippets_texts = [code.text for code in code_snippets_soups]

        # Drop the code elements so the remaining text is prose only.
        for code_soup in code_snippets_soups:
            code_soup.replace_with('')

        answer_text_content = answer_content_soup.text

    return models.Answer(url="", text=answer_text_content, code_snippets=code_snippets_texts,
                         out_links=outlinks, upvotes=upvotes, lang="java")


def parse_question(page_soup: BeautifulSoup):
    """Build a ``models.Question`` (with its answers) from a question page.

    Args:
        page_soup: Parsed markup of a whole question page. It must contain a
            ``<title>`` element and a ``div`` with class ``question`` that
            carries a ``data-questionid`` attribute and encloses a ``div``
            with class ``post-text``.

    Returns:
        A ``models.Question`` holding the page title, the question id, the
        body text with ``<code>`` elements removed, the text of those code
        elements, the ``href`` of every anchor left in the body, and the
        answers parsed by ``parse_answer`` ordered by descending ``upvotes``.
        ``url`` is always ``""`` and ``lang`` is always ``"java"``.

    Note:
        ``page_soup`` is modified in place: ``<code>`` elements of the
        question body are replaced with empty strings.
    """
    title_text = page_soup.find('title').text
    question_div = page_soup.find('div', attrs={'class': 'question'})
    qid = question_div['data-questionid']
    body = question_div.find('div', attrs={'class': 'post-text'})

    code_nodes = body.find_all('code')
    snippet_texts = [node.text for node in code_nodes]

    # Strip the code elements so the body text below is prose only.
    for node in code_nodes:
        node.replace_with('')

    # Anchors are gathered after the code elements have been stripped.
    link_targets = [anchor['href'] for anchor in body.find_all('a', href=True)]

    prose = body.text

    # Highest-voted answers first; sorted() is stable, like list.sort().
    ranked_answers = sorted(
        [parse_answer(answer_div)
         for answer_div in page_soup.find_all('div', attrs={'class': 'answer'})],
        key=lambda answer: answer.upvotes,
        reverse=True,
    )

    return models.Question(url="", title=title_text, question_id=qid, text=prose,
                           code_snippets=snippet_texts,
                           out_links=link_targets, answers=ranked_answers, lang="java")


def parse_question_from_file(file):
    """Read a saved question page from disk and parse it.

    Args:
        file: Path of the HTML file to read.

    Returns:
        The object returned by ``parse_question`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Context manager guarantees the handle is closed even if read() fails;
    # the explicit encoding avoids depending on the platform default.
    with open(file, mode="r", encoding="utf-8") as f:
        raw_html = f.read()

    page_soup = BeautifulSoup(raw_html, 'html.parser')

    question = parse_question(page_soup)

    log.success(f" question {question.id} parsed", module="parser")
    return question


if __name__ == "__main__":
    # Manual smoke run: parse one stored question page and print it as JSON.
    sample_path = "dataset/questions/raw/using-filechannel-to-write-any-inputstream?.html"
    print(parse_question_from_file(sample_path).to_json())